From ce82bd0117bd3598300b3a089d13d401b90279c7 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Mon, 23 Jun 2025 15:30:51 -0300 Subject: [PATCH 01/54] ci: add workflow for relocatable cmake package (#14346) --- .github/workflows/build-cmake-pkg.yml | 51 +++++++++++++++++++++++++++ .github/workflows/build.yml | 40 +++++++++++++++++++-- 2 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/build-cmake-pkg.yml diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml new file mode 100644 index 000000000..fee2ab96b --- /dev/null +++ b/.github/workflows/build-cmake-pkg.yml @@ -0,0 +1,51 @@ +name: Build relocatable cmake package +on: + workflow_dispatch: + workflow_call: + +jobs: + linux: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y build-essential tcl + + - name: Build + run: | + PREFIX="$(pwd)"/inst + cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \ + -DLLAMA_CURL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release + cmake --build build --config Release + cmake --install build --prefix "$PREFIX" --config Release + + export LLAMA_CONFIG="$PREFIX"/lib/cmake/llama/llama-config.cmake + tclsh <<'EOF' + set build(commit) [string trim [exec git rev-parse --short HEAD]] + set build(number) [string trim [exec git rev-list --count HEAD]] + set build(version) "0.0.$build(number)" + + set llamaconfig [read [open "$env(LLAMA_CONFIG)" r]] + set checks [list "set\\(LLAMA_VERSION \\s+$build(version)\\)" \ + "set\\(LLAMA_BUILD_COMMIT\\s+$build(commit)\\)" \ + "set\\(LLAMA_BUILD_NUMBER\\s+$build(number)\\)"] + + puts -nonewline "Checking llama-config.cmake version... " + foreach check $checks { + if {![regexp -expanded -- $check $llamaconfig]} { + puts "\"$check\" failed!" + exit 1 + } + } + puts "success." + EOF + + cd examples/simple-cmake-pkg + cmake -S . 
-B build -DCMAKE_PREFIX_PATH="$PREFIX"/lib/cmake + cmake --build build diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index be2828973..4feccf21e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -5,10 +5,43 @@ on: push: branches: - master - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/build.yml', + '.github/workflows/build-linux-cross.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp' + ] + pull_request: types: [opened, synchronize, reopened] - paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/.cmake', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp'] + paths: [ + '.github/workflows/build.yml', + '.github/workflows/build-linux-cross.yml', + '.github/workflows/build-cmake-pkg.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp', + '**/*.cu', + '**/*.cuh', + '**/*.swift', + '**/*.m', + '**/*.metal', + '**/*.comp' + ] concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -478,6 +511,9 @@ jobs: build-linux-cross: uses: ./.github/workflows/build-linux-cross.yml + build-cmake-pkg: + uses: ./.github/workflows/build-cmake-pkg.yml + macOS-latest-cmake-ios: runs-on: macos-latest From 0142961a2e67909e33cdf410274b56c08c5dce7a Mon Sep 17 00:00:00 2001 From: uvos Date: Tue, 24 Jun 2025 01:12:56 +0200 Subject: [PATCH 02/54] CUDA/HIP: optimize mmv paths taken for HIP devices (#14324) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 6 +++++- ggml/src/ggml-cuda/mmv.cu | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1369bc2d9..f6127aeee 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) { } static bool bf16_mma_hardware_available(const int cc) { - return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE; + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; +} + +static bool fp32_mma_hardware_available(const int cc) { + return GGML_CUDA_CC_IS_CDNA(cc); } // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. 
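The two helpers above are consumed by `ggml_cuda_should_use_mmv` in the mmv.cu hunk that follows: the batch-size (ne11) cutoff below which the custom mat-vec kernel is preferred over the MMA path is lowered on devices whose MMA hardware covers the given type. A minimal sketch of the F32 case, assuming the helper names from this hunk (the wrapper function itself is illustrative, not part of the patch):

    // Illustrative only: mirrors the F32 branch added to mmv.cu below.
    // On AMD, CDNA exposes FP32 MMA hardware, so the MMA path wins once
    // the batch size exceeds 3; without it, mmv stays faster up to 8.
    static bool amd_should_use_mmv_f32(const int cc, const int64_t ne11) {
        if (fp32_mma_hardware_available(cc)) {
            return ne11 <= 3;
        }
        return ne11 <= 8;
    }
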
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index 1502e9d94..e14c93516 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -456,6 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return ne11 <= 4; } return ne11 <= 3; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp32_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; } return ne11 <= 8; case GGML_TYPE_F16: @@ -468,6 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 5; + } + return ne11 <= 2; + } + return ne11 <= 8; } return ne11 <= 8; case GGML_TYPE_BF16: @@ -480,6 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (bf16_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; } return ne11 <= 8; default: From 901e20bbe571fbde48d13eb188f4e7cdc7562fb6 Mon Sep 17 00:00:00 2001 From: Bartowski <3266127+bartowski1182@users.noreply.github.com> Date: Tue, 24 Jun 2025 02:17:58 -0400 Subject: [PATCH 03/54] jinja : Add Mistral-Small-3.2-24B-Instruct-2506.jinja (#14349) This will allow the use of tools on the llama-server --- .../Mistral-Small-3.2-24B-Instruct-2506.jinja | 124 ++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja diff --git a/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja b/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja new file mode 100644 index 000000000..19a3eaee4 --- /dev/null +++ b/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja @@ -0,0 +1,124 @@ +{%- set today = strftime_now("%Y-%m-%d") %} +{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information or when the user's request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don't have the information and avoid making up anything. + +If the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\"). +You are always very attentive to dates, and when asked about information at specific dates, you discard information that is at another date. +You follow these instructions in all languages, and always respond to the user in the language they use or request. +Next sections describe the capabilities that you have. + +# WEB BROWSING INSTRUCTIONS + +You cannot perform any web search or access internet to open URLs, links etc. 
If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat. + +# MULTI-MODAL INSTRUCTIONS + +You have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos. +You cannot read nor transcribe audio files or videos. + +# TOOL CALLING INSTRUCTIONS + +You may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations: + +1. When the request requires up-to-date information. +2. When the request requires specific data that you do not have in your knowledge base. +3. When the request involves actions that you cannot perform without tools. + +Always prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment." %} + +{{- bos_token }} + +{%- set system_prompt = default_system_message %} +{%- set loop_messages = messages %} + +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{%- if messages|length > 0 and messages[0]['role'] == 'system' %} + {%- if messages[0]['content'] is string %} + {%- set system_prompt = messages[0]['content'] %} + {%- else %} + {%- set system_prompt = messages[0]['content'][0]['text'] %} + {%- endif %} + {%- set loop_messages = messages[1:] %} +{%- endif %} + +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- set ns = namespace(index=0) %} +{%- for message in loop_messages %} + {%- if not (message.role == "tool" or (message.get('tool_calls'))) %} + {%- if (message["role"] == "user") != (ns.index % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} + {%- set ns.index = ns.index + 1 %} + {%- endif %} +{%- endfor %} + +{{- '[SYSTEM_PROMPT]' + system_prompt + '[/SYSTEM_PROMPT]' }} + +{%- for message in loop_messages %} + {%- if message['role'] == 'system' %} + {%- if message['content'] is string %} + {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }} + {%- else %} + {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }} + {%- endif %} + {%- elif message['role'] == 'user' %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- '[AVAILABLE_TOOLS]' + tools|tojson + '[/AVAILABLE_TOOLS]' }} + {%- endif %} + {{- '[INST]' }} + {%- if message['content'] is string %} + {{- message['content'] }} + {%- else %} + {%- for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in message content!') }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '[/INST]' }} + {%- elif message['role'] == 'assistant' %} + {%- if message.get('tool_calls') %} + {%- for tool_call in message.tool_calls %} + {{- '[TOOL_CALLS]' + tool_call.function.name }} + {%- if not tool_call.id is defined or tool_call.id is not string or tool_call.id|length != 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }} + {%- endif %} + {{- '[CALL_ID]' + tool_call.id }} + {{- '[ARGS]' + tool_call['function']['arguments']|tojson }} + {%- endfor %} + {{- eos_token }} + {%- elif message['content'] is string %} + {{- message['content'] + eos_token }} + {%- else %} + {%- 
for block in message['content'] %} + {%- if block['type'] == 'text' %} + {{- block['text'] }} + {%- elif block['type'] in ['image', 'image_url'] %} + {{- '[IMG]' }} + {%- else %} + {{- raise_exception('Only text and image blocks are supported in assistant content!') }} + {%- endif %} + {%- endfor %} + {{- eos_token }} + {%- endif %} + {%- elif message['role'] == 'tool_results' or message['role'] == 'tool' %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {%- if not message.tool_call_id is defined or message.tool_call_id is not string or message['tool_call_id']|length != 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }} + {%- endif %} + {{- '[TOOL_RESULTS]' + message.tool_call_id + '[TOOL_CONTENT]' + content|string + '[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception('Only system, user, assistant, and tool roles are supported!') }} + {%- endif %} +{%- endfor %} From abf241045d09cad70dc797b0fba393ad09ee2cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 24 Jun 2025 09:31:00 +0200 Subject: [PATCH 04/54] main : honor --verbose-prompt on interactive prompts (#14350) --- tools/main/main.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/main/main.cpp b/tools/main/main.cpp index 154b37cdb..516bf0965 100644 --- a/tools/main/main.cpp +++ b/tools/main/main.cpp @@ -917,10 +917,19 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + if (params.verbose_prompt) { + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size() - original_size); + } + for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; + const std::string token_str = common_token_to_piece(ctx, token); output_tokens.push_back(token); - output_ss << common_token_to_piece(ctx, token); + output_ss << token_str; + + if (params.verbose_prompt) { + LOG_INF("%6d -> '%s'\n", token, token_str.c_str()); + } } // reset assistant message From 1b809cee225222094a0ff5be8467240487ce4ae4 Mon Sep 17 00:00:00 2001 From: Nigel Bosch Date: Tue, 24 Jun 2025 08:59:11 +0000 Subject: [PATCH 05/54] server : move no API key doc to /health (#14352) --- tools/server/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/server/README.md b/tools/server/README.md index 43aa65d50..1a624c13b 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -370,6 +370,8 @@ node index.js ### GET `/health`: Returns heath check result +This endpoint is public (no API key check). + **Response format** - HTTP status code 503 @@ -708,7 +710,7 @@ If the tokens are missing, then the extra context is simply prefixed at the star ### **GET** `/props`: Get server global properties. -This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props` +By default, it is read-only. 
To make POST request to change global properties, you need to start server with `--props` **Response format** From c148cf1946275952a79ad50b6199725f12a70411 Mon Sep 17 00:00:00 2001 From: Mathieu Baudier Date: Tue, 24 Jun 2025 15:05:31 +0200 Subject: [PATCH 06/54] cmake : use LLAMA_BUILD_NUMBER when defining LLAMA_INSTALL_VERSION (#14362) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50801cdc6..d2becb04c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ endif() if (NOT DEFINED LLAMA_BUILD_COMMIT) set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT}) endif() -set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER}) +set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER}) # override ggml options set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS}) From 62af464227dafa1c55e0535bcb24346326748f46 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 24 Jun 2025 18:26:30 +0300 Subject: [PATCH 07/54] batch : fix check for empty sequences in memory (#14364) * batch : fix check for empty sequences in memory ggml-ci * cont : reuse the var ggml-ci --- src/llama-batch.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 401e11364..91b1d6078 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -244,11 +244,13 @@ bool llama_batch_allocr::init( continue; } - if (memory) { + const llama_pos p0 = memory ? memory->seq_pos_max(s) : -1; + + if (p0 >= 0) { bool ok = true; if (batch.token) { - if (seq_pos_min(s) != memory->seq_pos_max(s) + 1) { + if (seq_pos_min(s) != p0 + 1) { ok = false; } } else { @@ -256,7 +258,7 @@ bool llama_batch_allocr::init( // for embeddings (typically used as vision input), we allow them to have repeating positions // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762 - if (seq_pos_min(s) != memory->seq_pos_max(s) && seq_pos_min(s) != memory->seq_pos_max(s) + 1) { + if (seq_pos_min(s) != p0 && seq_pos_min(s) != p0 + 1) { ok = false; } } @@ -267,7 +269,7 @@ bool llama_batch_allocr::init( " - the last position stored in the memory module of the context (i.e. 
the KV cache) for sequence %d is X = %d\n" " - the tokens for sequence %d in the input batch have a starting position of Y = %d\n" " it is required that the sequence positions remain consecutive: Y = X + 1\n", - __func__, s, s, memory->seq_pos_max(s), s, seq_pos_min(s)); + __func__, s, s, p0, s, seq_pos_min(s)); return false; } From 73e53dc834c0a2336cd104473af6897197b96277 Mon Sep 17 00:00:00 2001 From: lhez Date: Tue, 24 Jun 2025 11:46:25 -0700 Subject: [PATCH 08/54] opencl: ref count `ggml_backend_opencl_context` and refactor profiling (#14254) * Move profiling info into `ggml_backend_opencl_context` * Add `enqueue_ndrange_kernel` to launch kernel --- ggml/src/ggml-opencl/ggml-opencl.cpp | 777 +++++++++------------------ 1 file changed, 241 insertions(+), 536 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 628e574f0..96e8a8588 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive return { type, major, minor, patch }; } +// Profiling +struct ProfilingInfo { + std::string op_name; + std::string kernel_name; + + cl_kernel kernel; + cl_event evt; + + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong overhead_start; + cl_ulong overhead_end; + // For the times below, see spec for clGetEventProfilingInfo + // The time kernel spent in cmd queue - SUBMIT - QUEUED + cl_ulong cmd_queued_duration_ns; + // The time kernel spent for submission - START - SUBMIT + cl_ulong cmd_submit_duration_ns; + // Kernel execution time in nanoseconds - END - START + cl_ulong cmd_duration_ns; + // The time for the kernel to complete - COMPLETE - END + cl_ulong cmd_complete_duration_ns; + // Total time to finish the kernel - COMPELTE - QUEUED + cl_ulong cmd_total_duration_ns; + // Global and local work sizes. + size_t global_size[3]; + size_t local_size[3]; + // Op output size. 
+ size_t output_size[4]; +}; + +static void populateProfilingInfo( + ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim, + size_t global_size[3], size_t local_size[3], + const ggml_tensor * tensor) { + info.op_name = tensor->name; + info.kernel = kernel; + info.evt = evt; + + // 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose + info.local_size[0] = 0; + info.local_size[1] = 0; + info.local_size[2] = 0; + + info.global_size[0] = 0; + info.global_size[1] = 0; + info.global_size[2] = 0; + + if (local_size) { + for (cl_uint i = 0; i < work_dim; ++i) { + info.local_size[i] = local_size[i]; + } + } + + for (cl_uint i = 0; i < work_dim; ++i) { + info.global_size[i] = global_size[i]; + } + + info.output_size[0] = tensor->ne[0]; + info.output_size[1] = tensor->ne[1]; + info.output_size[2] = tensor->ne[2]; + info.output_size[3] = tensor->ne[3]; +} + struct ggml_backend_opencl_context; // backend device context @@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context { // backend context struct ggml_backend_opencl_context { + int ref_count; + cl_device_id device; std::string device_name; @@ -369,6 +436,108 @@ struct ggml_backend_opencl_context { cl_kernel kernel_timestep_embedding; cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat; + std::vector profiling_info; + + void write_profiling_info() { + FILE * fperf = fopen("cl_profiling.csv", "w"); + if (!fperf) { + GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); + return; + } + + // Populate profiling info + for (ProfilingInfo & info : profiling_info) { + cl_ulong cmd_queued; + cl_ulong cmd_submit; + cl_ulong cmd_start; + cl_ulong cmd_end; + cl_ulong cmd_complete; + + CL_CHECK(clWaitForEvents(1, &info.evt)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); + CL_CHECK(clGetEventProfilingInfo( + info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); + CL_CHECK(clReleaseEvent(info.evt)); + + char kernel_name[512]; + CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, + sizeof(kernel_name), kernel_name, NULL)); + info.kernel_name = kernel_name; + + info.cmd_queued = cmd_queued; + info.cmd_submit = cmd_submit; + info.cmd_start = cmd_start; + info.cmd_end = cmd_end; + + info.cmd_queued_duration_ns = cmd_submit - cmd_queued; + info.cmd_submit_duration_ns = cmd_start - cmd_submit; + info.cmd_duration_ns = cmd_end - cmd_start; + info.cmd_complete_duration_ns = cmd_complete - cmd_end; + info.cmd_total_duration_ns = cmd_complete - cmd_queued; + } + + // Dump a csv + float total_kernel_time = 0; + fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); + for (const ProfilingInfo & info : profiling_info) { + total_kernel_time += info.cmd_duration_ns/1.e6f; + fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", + info.op_name.c_str(), info.kernel_name.c_str(), + info.cmd_queued_duration_ns/1.e6f, + info.cmd_submit_duration_ns/1.e6f, + info.cmd_duration_ns/1.e6f, + info.cmd_complete_duration_ns/1.e6f, + 
info.cmd_total_duration_ns/1.e6f, + info.global_size[0], info.global_size[1], info.global_size[2], + info.local_size[0], info.local_size[1], info.local_size[2], + info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); + } + fclose(fperf); + + GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); + + // Dump a simple chrome trace + FILE* ftrace = fopen("cl_trace.json", "w"); + if (!ftrace) { + GGML_LOG_ERROR("Failed to open cl_trace.json\n"); + return; + } + + fprintf(ftrace, "[\n"); + for (const ProfilingInfo & info : profiling_info) { + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_queued/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", + info.kernel_name.c_str(), info.cmd_submit/1000); + + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_start/1000); + fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", + info.kernel_name.c_str(), info.cmd_end/1000); + } + fclose(ftrace); + } + + void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) { +#ifdef GGML_OPENCL_PROFILING + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + + profiling_info.emplace_back(); + populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor); +#else + GGML_UNUSED(tensor); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL)); +#endif + } + #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Transpose kernels cl_program program_transpose; @@ -395,47 +564,20 @@ struct ggml_backend_opencl_context { cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; #endif // GGML_OPENCL_USE_ADRENO_KERNELS + + void free() { + ref_count--; + if (ref_count == 0) { +#ifdef GGML_OPENCL_PROFILING + write_profiling_info(); +#endif + } + } }; // All registered devices with a default device in the front. static std::vector g_ggml_backend_opencl_devices; -// Profiling -#ifdef GGML_OPENCL_PROFILING -struct ProfilingInfo { - std::string op_name; - std::string kernel_name; - - cl_kernel kernel; - cl_event evt; - - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong overhead_start; - cl_ulong overhead_end; - // For the times below, see spec for clGetEventProfilingInfo - // The time kernel spent in cmd queue - SUBMIT - QUEUED - cl_ulong cmd_queued_duration_ns; - // The time kernel spent for submission - START - SUBMIT - cl_ulong cmd_submit_duration_ns; - // Kernel execution time in nanoseconds - END - START - cl_ulong cmd_duration_ns; - // The time for the kernel to complete - COMPLETE - END - cl_ulong cmd_complete_duration_ns; - // Total time to finish the kernel - COMPELTE - QUEUED - cl_ulong cmd_total_duration_ns; - // Global and local work sizes. - size_t global_size[3]; - size_t local_size[3]; - // Op output size. 
- size_t output_size[4]; -}; - -std::vector g_profiling_info; -#endif - inline std::string read_file(const std::string &path) { std::ifstream ifs(path); if (!ifs) { @@ -1669,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { backend_ctx->device = dev_ctx->device; backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + // ref_count get increased in ggml_backend_opencl_device_init + // This function is also used to retrieve backend context, so we don't want + // to increase ref_count for each call. We only want to increase ref_count + // when the associated device is initialized + backend_ctx->ref_count = 0; + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || strstr(dev_ctx->device_name.c_str(), "Qualcomm") || strstr(dev_ctx->device_version.c_str(), "Adreno")) { @@ -1841,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { return dev_ctx->backend_ctx; } -static void ggml_cl2_free(void) { -#ifdef GGML_OPENCL_PROFILING - FILE * fperf = fopen("cl_profiling.csv", "w"); - if (!fperf) { - GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); - return; +static void ggml_cl2_free(ggml_backend_t backend) { + ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context; + ctx->free(); + + // The CL context is shared by all backends, release it if all backends have been released + bool should_release_opencl = true; + for (auto device : g_ggml_backend_opencl_devices) { + ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context; + if (ctx_dev->backend_ctx->ref_count > 0) { + should_release_opencl = false; + } } - // Populate profiling info - for (ProfilingInfo & info : g_profiling_info) { - cl_ulong cmd_queued; - cl_ulong cmd_submit; - cl_ulong cmd_start; - cl_ulong cmd_end; - cl_ulong cmd_complete; - - CL_CHECK(clWaitForEvents(1, &info.evt)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL)); - CL_CHECK(clGetEventProfilingInfo( - info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL)); - CL_CHECK(clReleaseEvent(info.evt)); - - char kernel_name[512]; - CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME, - sizeof(kernel_name), kernel_name, NULL)); - info.kernel_name = kernel_name; - - info.cmd_queued = cmd_queued; - info.cmd_submit = cmd_submit; - info.cmd_start = cmd_start; - info.cmd_end = cmd_end; - - info.cmd_queued_duration_ns = cmd_submit - cmd_queued; - info.cmd_submit_duration_ns = cmd_start - cmd_submit; - info.cmd_duration_ns = cmd_end - cmd_start; - info.cmd_complete_duration_ns = cmd_complete - cmd_end; - info.cmd_total_duration_ns = cmd_complete - cmd_queued; + if (should_release_opencl) { + CL_CHECK(clReleaseContext(ctx->context)); } - - // Dump a csv - float total_kernel_time = 0; - fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n"); - for (const ProfilingInfo & info : g_profiling_info) { - total_kernel_time += info.cmd_duration_ns/1.e6f; - fprintf(fperf, 
"%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", - info.op_name.c_str(), info.kernel_name.c_str(), - info.cmd_queued_duration_ns/1.e6f, - info.cmd_submit_duration_ns/1.e6f, - info.cmd_duration_ns/1.e6f, - info.cmd_complete_duration_ns/1.e6f, - info.cmd_total_duration_ns/1.e6f, - info.global_size[0], info.global_size[1], info.global_size[2], - info.local_size[0], info.local_size[1], info.local_size[2], - info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); - } - fclose(fperf); - - GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); - - // Dump a simple chrome trace - FILE* ftrace = fopen("cl_trace.json", "w"); - if (!ftrace) { - GGML_LOG_ERROR("Failed to open cl_trace.json\n"); - return; - } - - fprintf(ftrace, "[\n"); - for (const ProfilingInfo & info : g_profiling_info) { - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", - info.kernel_name.c_str(), info.cmd_queued/1000); - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n", - info.kernel_name.c_str(), info.cmd_submit/1000); - - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", - info.kernel_name.c_str(), info.cmd_start/1000); - fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", - info.kernel_name.c_str(), info.cmd_end/1000); - } - fclose(ftrace); -#endif } //------------------------------------------------------------------------------ @@ -2011,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) { } static void ggml_backend_opencl_free(ggml_backend_t backend) { - ggml_cl2_free(); - - GGML_UNUSED(backend); + ggml_cl2_free(backend); } static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -2899,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); + // Getting a new reference to the backend, increase ref_count + backend_ctx->ref_count++; ggml_backend_t backend = new ggml_backend { /* .guid = */ ggml_backend_opencl_guid(), @@ -3159,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso #define dump_tensor(tensor) #endif -//------------------------------------------------------------------------------ -// Profiling utility -//------------------------------------------------------------------------------ -#ifdef GGML_OPENCL_PROFILING -static void populateProfilingInfo( - ProfilingInfo& info, cl_event evt, cl_kernel kernel, - size_t global_size[3], size_t local_size[3], - const ggml_tensor * tensor) { - info.op_name = tensor->name; - info.kernel = kernel; - info.evt = evt; - - info.local_size[0] = local_size[0]; - info.local_size[1] = local_size[1]; - info.local_size[2] = local_size[2]; - info.global_size[0] = global_size[0]; - info.global_size[1] = global_size[1]; - info.global_size[2] = global_size[2]; - info.output_size[0] = tensor->ne[0]; - info.output_size[1] = tensor->ne[1]; - info.output_size[2] = tensor->ne[2]; - info.output_size[3] = tensor->ne[3]; -} -#endif - 
//------------------------------------------------------------------------------ // Ops //------------------------------------------------------------------------------ @@ -3227,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c const cl_ulong nb2 = dst ? dst->nb[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3271,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; size_t local_work_size[] = {1, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3321,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3396,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3461,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst ? 
dst->nb[3] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3536,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3598,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const const cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3661,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3723,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const const 
cl_ulong nb3 = dst->nb[3]; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -3786,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { unsigned int nth = MIN(64, ne0); size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -3821,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3848,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3868,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3895,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, 
global_work_size, local_work_size, 0, NULL, &evt); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3915,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3947,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3967,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -3992,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4012,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4044,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
} -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4064,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4096,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4116,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4157,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4177,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; //ggml_backend_opencl_device_context * dev_ctx = // (ggml_backend_opencl_device_context *)backend->device->context; @@ -4241,15 +4152,7 @@ static void 
ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c // This is local memory - the size depends on subgroup size. CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL)); -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4261,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4300,15 +4202,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1}; size_t local_work_size[] = {(size_t)sgs, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4320,7 +4214,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const UNUSED(src1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -4397,16 +4290,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const } if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return; - -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? 
local_work_size : (size_t[3]){0,0,0}, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) { @@ -4419,7 +4303,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con UNUSED(src1_shape_def); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; if (backend_ctx->kernel_repeat == nullptr) { GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__); @@ -4467,15 +4350,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = { gws0, gws1, gws2 }; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { @@ -4488,7 +4363,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; if (backend_ctx->kernel_pad == nullptr) { GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__); @@ -4533,15 +4407,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t local_work_size_ptr = nullptr; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? 
local_work_size : (size_t[3]){0,0,0}, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) { @@ -4553,7 +4419,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg GGML_ASSERT(dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0); cl_kernel kernel = nullptr; @@ -4644,17 +4509,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg local_work_size_ptr = nullptr; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - size_t profiling_gws[3] = {global_work_size[0], 1, 1}; - size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1}; - populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4732,7 +4587,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con global_work_size[1] = d_ne1; global_work_size[2] = d_ne2; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } } } else { @@ -4782,7 +4637,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con d_ne2 > 0 ? (size_t)d_ne2 : 1, d_ne3 > 0 ? 
(size_t)d_ne3 : 1 }; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL)); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst); } } @@ -4795,7 +4650,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor GGML_ASSERT(dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; if (backend_ctx->kernel_timestep_embedding == nullptr) { GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__); @@ -4828,17 +4682,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor size_t global_work_size[] = {gws0, gws1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem - - g_profiling_info.emplace_back(); - size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1}; - size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS - populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst); } static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -4853,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -5058,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co static_cast(padded_height_B) }; - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst); - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst); } else { // no need to transpose B in other cases // create an image for B from sub_buffer @@ -5188,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co // enqueue kernel with profiling // <--------------------------------------------> // - #ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); - // enqueue kernel without profiling - #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); - #endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); // <--------------------------------------------> // // deallocate 
sub buffers and images @@ -5277,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co global_work_size[2] = (size_t)ne12*ne13; } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); return; } #else // GGML_OPENCL_SOA_Q @@ -5515,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else if (src0t == GGML_TYPE_Q4_K) { GGML_ASSERT(false && "not implemented"); } else if (src0t == GGML_TYPE_Q3_K) { @@ -5534,30 +5344,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { int64_t ny = (ne11 + nrows - 1)/nrows; size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } } @@ -5574,7 +5368,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, GGML_ASSERT(src2->extra); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra; @@ -5680,15 +5473,7 @@ static void 
ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123}; size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5701,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; float scale; memcpy(&scale, dst->op_params, sizeof(scale)); @@ -5730,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5775,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -5840,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1); } static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -5871,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr const int ne02 = src0 ? 
src0->ne[2] : 0; ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -5895,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; size_t local_work_size[] = {64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { kernel = backend_ctx->kernel_diag_mask_inf; @@ -5923,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. } -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst); } } @@ -5951,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c } ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6031,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6051,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const GGML_ASSERT(dst->extra); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; @@ -6217,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = 
{(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6240,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6309,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC}; size_t local_work_size[] = {256, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6332,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6364,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1}; size_t local_work_size[] = {(size_t)ne00_padded, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -6386,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c GGML_ASSERT(ggml_is_contiguous(src0)); ggml_backend_opencl_context *backend_ctx = 
(ggml_backend_opencl_context *)backend->context; - cl_command_queue queue = backend_ctx->queue; ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; @@ -6427,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)64, 1, 1}; -#ifdef GGML_OPENCL_PROFILING - cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); - - g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); -#else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); -#endif + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } //------------------------------------------------------------------------------ From 2bf9d539dd158345e3a3b096e16474af535265b4 Mon Sep 17 00:00:00 2001 From: Anton Mitkov Date: Wed, 25 Jun 2025 17:09:55 +0100 Subject: [PATCH 09/54] sycl: GGML_SYCL_DISABLE_OPT on by default for all Intel Devices (#13973) --- docs/backend/SYCL.md | 2 +- ggml/src/ggml-sycl/common.hpp | 25 +------------------------ ggml/src/ggml-sycl/ggml-sycl.cpp | 6 ++---- ggml/src/ggml-sycl/sycl_hw.cpp | 4 +++- ggml/src/ggml-sycl/sycl_hw.hpp | 3 +++ 5 files changed, 10 insertions(+), 30 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 249e73451..6e9b88935 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -757,7 +757,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | -| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase | +| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimization features for Intel GPUs. (Recommended to set to 1 for Intel devices older than Gen 10) | | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. | | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 753b4af14..4e7449d06 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -199,7 +199,7 @@ struct sycl_device_info { // size_t smpb; // max. shared memory per block bool vmm; // virtual memory support size_t total_vram; - sycl_hw_info hw_info; + //sycl_hw_info hw_info; // device id and arch, currently not used optimize_feature opt_feature; }; @@ -286,29 +286,6 @@ struct ggml_tensor_extra_gpu { void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={}); -inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) { - optimize_feature opt; - - opt.reorder = - (arch == syclex::architecture::intel_gpu_dg1 || - arch == syclex::architecture::intel_gpu_acm_g10 || - arch == syclex::architecture::intel_gpu_acm_g11 || - arch == syclex::architecture::intel_gpu_acm_g12 || - arch == syclex::architecture::intel_gpu_pvc || - arch == syclex::architecture::intel_gpu_pvc_vg || - arch == syclex::architecture::intel_gpu_mtl_u || - arch == syclex::architecture::intel_gpu_mtl_s || - arch == syclex::architecture::intel_gpu_mtl_h || - arch == syclex::architecture::intel_gpu_arl_u || - arch == syclex::architecture::intel_gpu_arl_s || - arch == syclex::architecture::intel_gpu_arl_h || - arch == syclex::architecture::intel_gpu_bmg_g21 || - arch == syclex::architecture::intel_gpu_lnl_m - ); - - return opt; -} - namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f25a96a62..9cb36ae99 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -83,9 +83,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - info.devices[i].hw_info = get_device_hw_info(&device); - info.devices[i].opt_feature = check_gpu_optimize_feature(info.devices[i].hw_info.arch); - + info.devices[i].opt_feature.reorder = !device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); } @@ -195,7 +193,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); + g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); diff --git a/ggml/src/ggml-sycl/sycl_hw.cpp b/ggml/src/ggml-sycl/sycl_hw.cpp index da121ffc2..704114003 100644 --- a/ggml/src/ggml-sycl/sycl_hw.cpp +++ b/ggml/src/ggml-sycl/sycl_hw.cpp @@ -1,6 +1,7 @@ #include "sycl_hw.hpp" - +// TODO: currently not used +/* sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { sycl_hw_info res; int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>(); @@ -11,3 +12,4 @@ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) { return res; } +*/ diff --git a/ggml/src/ggml-sycl/sycl_hw.hpp b/ggml/src/ggml-sycl/sycl_hw.hpp index bf689450c..36b140bf0 100644 --- a/ggml/src/ggml-sycl/sycl_hw.hpp +++ b/ggml/src/ggml-sycl/sycl_hw.hpp @@ -10,6 +10,8 @@ namespace syclex = sycl::ext::oneapi::experimental; +// TODO: currently not used +/* struct sycl_hw_info {
syclex::architecture arch; int32_t device_id; @@ -18,6 +20,7 @@ struct sycl_hw_info { bool is_in_vector(std::vector<int> &vec, int item); sycl_hw_info get_device_hw_info(sycl::device *device_ptr); +*/ #endif // SYCL_HW_HPP From b193d5306912a2adae0fde7481819f6ee0941bc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 25 Jun 2025 23:26:51 +0200 Subject: [PATCH 10/54] ggml : do not output unprintable characters on GGUF load failure (#14381) --- ggml/src/gguf.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index a0a318a29..5ffd12b8b 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -335,7 +335,11 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par for (uint32_t i = 0; i < magic.size(); i++) { if (magic[i] != GGUF_MAGIC[i]) { - GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, magic[0], magic[1], magic[2], magic[3]); + char c0 = isprint(magic[0]) ? magic[0] : '?'; + char c1 = isprint(magic[1]) ? magic[1] : '?'; + char c2 = isprint(magic[2]) ? magic[2] : '?'; + char c3 = isprint(magic[3]) ? magic[3] : '?'; + GGML_LOG_ERROR("%s: invalid magic characters: '%c%c%c%c', expected 'GGUF'\n", __func__, c0, c1, c2, c3); gguf_free(ctx); return nullptr; } From 60ef23d6c14d325d83eae5752e5de39ad268e9b0 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Thu, 26 Jun 2025 05:49:04 +0800 Subject: [PATCH 11/54] ggml-cpu: enable IBM NNPA Vector Intrinsics (#14317) * ggml-cpu: add nnpa compile flag Signed-off-by: Aaron Teo (cherry picked from commit 4a9f60c201573128f73a65999b3e5cc497fae5c1) * ggml-cpu: add fp16->fp32 nnpa first Signed-off-by: Aaron Teo (cherry picked from commit 8d4a7987f9c1887f716be96250f2caeee0253929) * ggml-cpu: add fp32->fp16 Signed-off-by: Aaron Teo (cherry picked from commit 0ff0d6516247a41d2ade42b42cf0d676a4dd1627) * ggml-cpu: better variable names Signed-off-by: Aaron Teo (cherry picked from commit 2f58bbcbb89c183340e252362b2a40651f573f1f) * docs: update s390x docs Signed-off-by: Aaron Teo (cherry picked from commit 01b929491b50071a5d0572235dcf5a449da70aa7) * ggml-cpu: add debugging prints to see if dlf16 is correct Signed-off-by: Aaron Teo * ggml-cpu: fix print vs printf Signed-off-by: Aaron Teo * ggml-cpu: fix float placeholder Signed-off-by: Aaron Teo * ggml-cpu: ensure fp16 and fp32 load and stores are called Signed-off-by: Aaron Teo * ggml-cpu: fp16 load ensured to hit Signed-off-by: Aaron Teo * ggml-cpu: remove sigint from fp16 store for some reason, the function is not getting a hit when debugged with gdb. we will need to investigate further Signed-off-by: Aaron Teo * ggml-cpu: activate nnpa for ggml_cpu_fp16_to_fp32 Signed-off-by: Aaron Teo * ggml-cpu: nnpa activate ggml_cpu_fp16_to_fp32 for 8 elements Signed-off-by: Aaron Teo * ggml-cpu: nnpa switch to vec_xst test Signed-off-by: Aaron Teo * ggml-cpu: switch to vec_xst for 4 element loops also Signed-off-by: Aaron Teo * ggml-cpu: rework noop Signed-off-by: Aaron Teo * ggml-cpu: remove noop, general code cleanup Signed-off-by: Aaron Teo * ggml-cpu: clarify variable naming Signed-off-by: Aaron Teo * ggml-cpu: activate nnpa for ggml_cpu_fp32_to_fp16 Signed-off-by: Aaron Teo * ggml-cpu: add breakpoint for debugging Signed-off-by: Aaron Teo * ggml-cpu: test fix for conversion failure Signed-off-by: Aaron Teo * ggml-cpu: disable fp32->fp16 nnpa conversions for now there are some conversion failures in nnpa that require the eyes of an ibm stsm.
will create a separate pr to introduce the fp32->fp16 change. Signed-off-by: Aaron Teo * ggml-cpu: switch to elif macro Signed-off-by: Aaron Teo * ggml-cpu: reattempt fp32->fp16 Signed-off-by: Aaron Teo * ggml-cpu: fix typo Signed-off-by: Aaron Teo * ggml-cpu: reattempt fp32->fp16 Signed-off-by: Aaron Teo * ggml-cpu: fix compiler types Signed-off-by: Aaron Teo * ggml-cpu: change to typedef vector types Signed-off-by: Aaron Teo * ggml-cpu: add 4 element loops for fp32->fp16 Signed-off-by: Aaron Teo * ggml-cpu: clarified vector naming Signed-off-by: Aaron Teo * ggml-cpu: bring back fp32->fp16 store nnpa Signed-off-by: Aaron Teo * ggml-cpu: activate nnpa fp32->fp16 or fp16->fp32 compute Signed-off-by: Aaron Teo * ggml-cpu: add nnpa macro check in ggml-impl Signed-off-by: Aaron Teo * ggml-cpu: add missing __func__ Signed-off-by: Aaron Teo * ggml-cpu: diagnose why __NNPA__ macro is not being defined Signed-off-by: Aaron Teo * ggml-cpu: import vecintrin.h to fix compiler errors Signed-off-by: Aaron Teo * ggml-cpu: update macro tests Signed-off-by: Aaron Teo * ggml-cpu: move s390x typedef to own header file Signed-off-by: Aaron Teo * Revert "ggml-cpu: move s390x typedef to own header file" This reverts commit 157f856c34589566151630e294563a420702db39. Signed-off-by: Aaron Teo * ggml-cpu: switch to importing ggml-cpu-impl instead Signed-off-by: Aaron Teo * ggml-cpu: fix macro declaration Signed-off-by: Aaron Teo * ggml-cpu: test more macros Signed-off-by: Aaron Teo * ggml-cpu: add debug prints Signed-off-by: Aaron Teo * ggml-cpu: bruteforce macro definitions Signed-off-by: Aaron Teo * ggml-cpu: move macro definitions Signed-off-by: Aaron Teo * ggml-cpu: add ggml-impl.h to cmakelists Signed-off-by: Aaron Teo * ggml-cpu: switch to private macros Signed-off-by: Aaron Teo * ggml-cpu: move s390x typedef to own header file Signed-off-by: Aaron Teo (cherry picked from commit 157f856c34589566151630e294563a420702db39) * ggml-cpu: move things around Signed-off-by: Aaron Teo * ggml-cpu: bring back compile macros Signed-off-by: Aaron Teo * ggml-cpu: switch to quotes for import Signed-off-by: Aaron Teo * ggml-cpu: add compiler error macro Signed-off-by: Aaron Teo * ggml-cpu: add s390x detection in ggml-src Signed-off-by: Aaron Teo * ggml-cpu: bring back compile definitions Signed-off-by: Aaron Teo * ggml-cpu: undo cmakelists work Signed-off-by: Aaron Teo * Revert "ggml-cpu: move s390x typedef to own header file" This reverts commit 18d79e1a30b39d9aaa0bd58400c5cf2c32135c9a. Signed-off-by: Aaron Teo * ggml-cpu: remove typedefs.h Signed-off-by: Aaron Teo * ggml-cpu: remove typedef from cmakelists Signed-off-by: Aaron Teo * ggml-cpu: add ggml-impl.h future notes Signed-off-by: Aaron Teo * ggml-cpu: add todo comment for future reference Signed-off-by: Aaron Teo * ggml-cpu: clarify naming of dlf16 Signed-off-by: Aaron Teo * ggml-cpu: remove unnecessary target compile definitions Signed-off-by: Aaron Teo * ggml-cpu: move nnpa fp16->fp32 and fp32->fp16 to simd-mappings Signed-off-by: Aaron Teo * ggml: refactor fp32->fp16 and fp16->fp32 simd to ggml-cpu Signed-off-by: Aaron Teo * docs: update broken huggingface link for s390x Signed-off-by: Aaron Teo * ggml-cpu: fix duplicate func names during compile Signed-off-by: Aaron Teo * Revert "ggml-cpu: fix duplicate func names during compile" This reverts commit fbb733451f27677063b914d4f6c9a9841d45b38d. Signed-off-by: Aaron Teo * Revert "ggml: refactor fp32->fp16 and fp16->fp32 simd to ggml-cpu" This reverts commit bd288e8fa52b5244f65cee21cb61062f1a9e0ca5. 
Signed-off-by: Aaron Teo * ggml: refactor fp16<->fp32 simd to ggml-cpu Signed-off-by: Aaron Teo * ggml-cpu: fix missing simd-mappings.h import in quants.c Signed-off-by: Aaron Teo * ggml-cpu: fix missing simd-mappings.h within repack Signed-off-by: Aaron Teo * ggml-cpu: fix amx mmq missing simd-mappings.h Signed-off-by: Aaron Teo * ggml-cpu: attempt at fixing loongarch failing build Signed-off-by: Aaron Teo * ggml-cpu: move nnpa together with other fp16<->fp32 simd Signed-off-by: Aaron Teo * ggml-cpu: fix wrong refactor of ggml-base ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164176555 Signed-off-by: Aaron Teo * ggml: remove dependency on ggml-cpu from ggml-base Signed-off-by: Aaron Teo * ggml-cpu: rename all fp16<->fp32 macros to prefix with ggml_cpu ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164449406 Signed-off-by: Aaron Teo * ggml-cpu: remove mistaken fallback macro fallback logic was already implemented but i was too sleepy to realise Signed-off-by: Aaron Teo * ggml: move ggml_table_f32_f16 to ggml-cpu ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164775006 Signed-off-by: Aaron Teo * ggml-cpu: move ggml_table_f32_f16 back to ggml-base due to ci failures Signed-off-by: Aaron Teo * Revert "ggml-cpu: move ggml_table_f32_f16 back to ggml-base due to ci failures" This reverts commit 32a3533564bdb7902cefb9c89b1c9e956a81ce29. Signed-off-by: Aaron Teo * Revert "ggml: move ggml_table_f32_f16 to ggml-cpu" This reverts commit 9e40d984ad27d7b60392fb2b7548885201864fe4. Signed-off-by: Aaron Teo * ggml: move ggml_table_f32_f16 to ggml-cpu ref: https://github.com/ggml-org/llama.cpp/pull/14317#discussion_r2164775006 Signed-off-by: Aaron Teo (cherry picked from commit 9e40d984ad27d7b60392fb2b7548885201864fe4) * ggml: move ggml_table_f32_f16 to ggml-cpu.c Signed-off-by: Aaron Teo * ggml-cpu: extern c ggml_table_f32_f16 + chore docs Signed-off-by: Aaron Teo * ggml-cpu: dedup ggml_table_f32_f16 from simd-mappings.h we rely on the variable declaration in ggml-cpu.c instead Signed-off-by: Aaron Teo * Revert "ggml-cpu: dedup ggml_table_f32_f16 from simd-mappings.h" This reverts commit f71b21d2f74f5e03ec0c2b4fefd3cbf395aecf16. Signed-off-by: Aaron Teo * ggml-cpu: bring back ggml_table_f32_f16 Signed-off-by: Aaron Teo * Revert "ggml-cpu: bring back ggml_table_f32_f16" This reverts commit 2dce119178bed5ef5c8398c4230ddd14fef80e49. 
Signed-off-by: Aaron Teo * fix ggml time initialization * fix f32_f16 table init * remove extra line --------- Signed-off-by: Aaron Teo Co-authored-by: slaren --- docs/build-s390x.md | 41 +++- docs/build.md | 4 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-cpu.h | 1 + ggml/src/ggml-cpu/CMakeLists.txt | 8 + ggml/src/ggml-cpu/amx/mmq.cpp | 19 +- ggml/src/ggml-cpu/arch/arm/quants.c | 217 +++++++++--------- ggml/src/ggml-cpu/arch/arm/repack.cpp | 25 ++- ggml/src/ggml-cpu/arch/loongarch/quants.c | 105 ++++----- ggml/src/ggml-cpu/arch/powerpc/quants.c | 111 ++++----- ggml/src/ggml-cpu/arch/riscv/quants.c | 83 +++---- ggml/src/ggml-cpu/arch/riscv/repack.cpp | 47 ++-- ggml/src/ggml-cpu/arch/s390/quants.c | 57 ++--- ggml/src/ggml-cpu/arch/wasm/quants.c | 59 ++--- ggml/src/ggml-cpu/arch/x86/quants.c | 165 +++++++------- ggml/src/ggml-cpu/arch/x86/repack.cpp | 39 ++-- ggml/src/ggml-cpu/common.h | 5 +- ggml/src/ggml-cpu/ggml-cpu-impl.h | 12 +- ggml/src/ggml-cpu/ggml-cpu.c | 75 +++++-- ggml/src/ggml-cpu/ggml-cpu.cpp | 3 + ggml/src/ggml-cpu/llamafile/sgemm.cpp | 5 +- ggml/src/ggml-cpu/ops.cpp | 96 ++++---- ggml/src/ggml-cpu/quants.c | 49 ++-- ggml/src/ggml-cpu/repack.cpp | 29 +-- ggml/src/ggml-cpu/simd-mappings.h | 244 +++++++++++++++++--- ggml/src/ggml-cpu/vec.cpp | 4 +- ggml/src/ggml-cpu/vec.h | 90 ++++---- ggml/src/ggml-impl.h | 262 ++++++---------------- ggml/src/ggml.c | 11 - 29 files changed, 1005 insertions(+), 862 deletions(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index f44038c58..bb6eae784 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -28,8 +28,9 @@ cmake --build build --config Release -j $(nproc) ``` **Notes**: -- For faster repeated compilation, install [ccache](https://ccache.dev/) -- By default, VXE/VXE2 is enabled. To disable it (not recommended): + +- For faster repeated compilation, install [ccache](https://ccache.dev/) +- By default, VXE/VXE2 is enabled. To disable it (not recommended): ```bash cmake -S . -B build \ @@ -41,18 +42,29 @@ cmake --build build --config Release -j $(nproc) cmake --build build --config Release -j $(nproc) ``` -- For debug builds: +- By default, NNPA is enabled when available. To disable it (not recommended): + + ```bash + cmake -S . -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_BLAS=ON \ + -DGGML_BLAS_VENDOR=OpenBLAS \ + -DGGML_NNPA=OFF + + cmake --build build --config Release -j $(nproc) + ``` + +- For debug builds: ```bash cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DGGML_BLAS=ON \ -DGGML_BLAS_VENDOR=OpenBLAS - cmake --build build --config Debug -j $(nproc) ``` -- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: +- For static builds, add `-DBUILD_SHARED_LIBS=OFF`: ```bash cmake -S . -B build \ @@ -70,7 +82,7 @@ All models need to be converted to Big-Endian. You can achieve this in three cas 1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)** - You can find popular models pre-converted and verified at [s390x Ready Models](hf.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08). + You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08). These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE. @@ -101,27 +113,33 @@ All models need to be converted to Big-Endian. 
You can achieve this in three cas ``` For example, + ```bash python3 gguf-py/gguf/scripts/gguf_convert_endian.py granite-3.3-2b-instruct-le.f16.gguf BIG mv granite-3.3-2b-instruct-le.f16.gguf granite-3.3-2b-instruct-be.f16.gguf ``` **Notes:** + - The GGUF endian conversion script may not support all data types at the moment and may fail for some models/quantizations. When that happens, please try manually converting the safetensors model to GGUF Big-Endian via Step 2. ## IBM Accelerators ### 1. SIMD Acceleration -Only available in IBM z15 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14 or EC13. In such systems, the APIs can still run but will use a scalar implementation. +Only available in IBM z15 or later systems with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation. -### 2. zDNN Accelerator +### 2. NNPA Vector Intrinsics Acceleration -*Only available in IBM z16 or later system. No direction at the moment.* +Only available in IBM z16 or later systems with the `-DGGML_NNPA=ON` (turned on when available) compile flag. No hardware acceleration is possible with llama.cpp on older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation. -### 3. Spyre Accelerator +### 3. zDNN Accelerator -*No direction at the moment.* +_Only available in IBM z16 or later systems. No direction at the moment._ + +### 4. Spyre Accelerator + +_No direction at the moment._ ## Performance Tuning @@ -154,4 +172,3 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl 2. **Other Questions** Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com). - diff --git a/docs/build.md b/docs/build.md index 20a6f606e..2e0b5d970 100644 --- a/docs/build.md +++ b/docs/build.md @@ -557,6 +557,10 @@ ninja To read documentation for how to build on Android, [click here](./android.md) +## IBM Z & LinuxONE + +To read documentation for how to build on IBM Z & LinuxONE, [click here](./build-s390x.md) + ## Notes about GPU-accelerated backends The GPU may still be used to accelerate some parts of the computation even when using the `-ngl 0` option. You can fully disable GPU acceleration by using `--device none`.
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4e7399f9e..215eb2348 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON) option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_NNPA "ggml: enable nnpa" ON) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index de77a875e..e3b79d09b 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -101,6 +101,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_nnpa (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); GGML_BACKEND_API int ggml_cpu_has_llamafile (void); diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 71b1d67b8..671fad4d2 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -448,6 +448,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # TODO: Separation to determine activation of VX/VXE/VXE2 if (${S390X_M} MATCHES "8561|8562") + set(GGML_NNPA OFF) message(STATUS "z15 target") list(APPEND ARCH_FLAGS -march=z15) elseif (${S390X_M} MATCHES "3931") @@ -464,7 +465,14 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() if (GGML_VXE) + message(STATUS "VX/VXE/VXE2 enabled") list(APPEND ARCH_FLAGS -mvx -mzvector) + list(APPEND ARCH_DEFINITIONS GGML_VXE) + endif() + + if (GGML_NNPA) + message(STATUS "NNPA enabled") + list(APPEND ARCH_DEFINITIONS GGML_NNPA) endif() elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm") message(STATUS "Wasm detected") diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index cec34eb64..47c61b881 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -8,6 +8,7 @@ #include "mmq.h" #include "ggml-impl.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "quants.h" #include "ggml-quants.h" #include @@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_ // Quantize these floats const float iscale = 127.f / amax; - y[i].d = GGML_FP32_TO_FP16(1 / iscale); + y[i].d = GGML_CPU_FP32_TO_FP16(1 / iscale); const float id = ( amax != 0.0f ) ? 
iscale : 0.f; const __m512 vscale = _mm512_set1_ps(id); @@ -1090,7 +1091,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1113,8 +1114,8 @@ struct acc_C { const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(ggml_half)))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); - const __m512 vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].s)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); + const __m512 vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].s)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1137,7 +1138,7 @@ struct acc_C { const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset))); for (int m = 0; m < nr; ++m) { - const __m512 vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[m * lda].d)); + const __m512 vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[m * lda].d)); const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N)); __m512 vsum; @@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni for (int k = 0; k < 8; ++k) { va[k] = _mm512_set1_epi32(a_ptr[k]); } - vd1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].d)); - vs1 = _mm512_set1_ps(GGML_FP16_TO_FP32(A[0 * KB + i].s)); + vd1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d)); + vs1 = _mm512_set1_ps(GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s)); } // load b @@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnniqs + 16); float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) }; float32x4_t scale = vld1q_f32(_scale); @@ -274,10 +275,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // dot product sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, svdot_s32(svdup_n_s32(0), qx0ls, qy0l), - svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, svdot_s32(svdup_n_s32(0), qx1ls, qy1l), - svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); @@ -313,9 +314,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // dot product sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); 
+ svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); @@ -354,9 +355,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // dot product sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, - svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); @@ -404,8 +405,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -423,7 +424,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -464,10 +465,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_1 * GGML_RESTRICT b_y1 = &vy1[i]; float32_t summs_t[4] = { - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), - GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), - GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y0->s), + GGML_CPU_FP16_TO_FP32(b_x0->m) * GGML_CPU_FP16_TO_FP32(b_y1->s), + GGML_CPU_FP16_TO_FP32(b_x1->m) * GGML_CPU_FP16_TO_FP32(b_y1->s) }; summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); @@ -490,10 +491,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // mmla into int32x4_t float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) }; 
float32x4_t scale = vld1q_f32(_scale); @@ -539,7 +540,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s) + GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -562,8 +563,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; @@ -582,7 +583,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -666,10 +667,10 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -694,7 +695,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -739,8 +740,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); - summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s); + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -784,10 +785,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), 
GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; @@ -812,7 +813,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -864,10 +865,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); float32_t _scale[4] = { - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x0->d)*GGML_CPU_FP16_TO_FP32(b_y1->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y0->d), + GGML_CPU_FP16_TO_FP32(b_x1->d)*GGML_CPU_FP16_TO_FP32(b_y1->d) }; float32x4_t scale = vld1q_f32(_scale); @@ -934,10 +935,10 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), - svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), - svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); @@ -960,9 +961,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), - svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); @@ -1002,8 +1003,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); // scale creation - const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); - const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); + const float32_t 
deq1 = GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d); // duplicate deq1 in first half of vector and deq2 in second half of vector const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); @@ -1043,11 +1044,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), - ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_CPU_FP16_TO_FP32(x0->d)*GGML_CPU_FP16_TO_FP32(y0->d)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), - ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_CPU_FP16_TO_FP32(x1->d)*GGML_CPU_FP16_TO_FP32(y1->d)); } sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); @@ -1059,7 +1060,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -1217,7 +1218,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const int16x8_t ysum0 = vld1q_s16(y[i].bsums); const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; #if defined(__ARM_FEATURE_DOTPROD) sumi0 = vaddq_s32(sumi0, sumi1); @@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); } *s = sumf; @@ -1362,7 +1363,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const int16x8_t ysum0 = vld1q_s16(y[i].bsums); const int16x8_t ysum1 = vld1q_s16(y[i].bsums + 8); - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; #if defined(__ARM_FEATURE_DOTPROD) sumi0 = vaddq_s32(sumi0, sumi1); @@ -1393,7 +1394,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); sumf += (float) sumi * d; } @@ -1425,9 +1426,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi switch (vector_length) { case 128: for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; @@ -1570,9 +1571,9 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi case 256: case 512: for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); svfloat32_t d_broad = svdup_n_f32((float32_t)d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + 
const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); svfloat32_t dmin_broad = svdup_n_f32((float32_t)dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; @@ -1671,8 +1672,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1742,8 +1743,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1805,7 +1806,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3_sv = x[i].qs; const uint8_t * GGML_RESTRICT qh_sv = x[i].hmask; @@ -1981,7 +1982,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].hmask; @@ -2112,7 +2113,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2258,18 +2259,18 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi bias[3] = vaddvq_s32(vaddq_s32(vmull_s16(vget_low_s16(y1_sums), vget_low_s16(x1_mins)), vmull_s16(vget_high_s16(y1_sums), vget_high_s16(x1_mins)))); const float32x4_t dmins = { - GGML_FP16_TO_FP32(x0->dmin) * y0->d, - GGML_FP16_TO_FP32(x0->dmin) * y1->d, - GGML_FP16_TO_FP32(x1->dmin) * y0->d, - GGML_FP16_TO_FP32(x1->dmin) * y1->d, + GGML_CPU_FP16_TO_FP32(x0->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->dmin) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->dmin) * y1->d, }; vfsum = vmlsq_f32(vfsum, vcvtq_f32_s32(vld1q_s32(bias)), dmins); const float32x4_t superblock_scale = { - GGML_FP16_TO_FP32(x0->d) * y0->d, - GGML_FP16_TO_FP32(x0->d) * y1->d, - GGML_FP16_TO_FP32(x1->d) * y0->d, - GGML_FP16_TO_FP32(x1->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, }; vfsum = vmlaq_f32(vfsum, vcvtq_f32_s32(visum), superblock_scale); } @@ -2289,8 +2290,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const 
float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2377,8 +2378,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2478,9 +2479,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2520,8 +2521,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8)); @@ -2630,9 +2631,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2827,10 +2828,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t vibias = vmulq_n_s32(vld1q_s32(bias), 32); const float32x4_t superblock_scale = { - GGML_FP16_TO_FP32(x0->d) * y0->d, - GGML_FP16_TO_FP32(x0->d) * y1->d, - GGML_FP16_TO_FP32(x1->d) * y0->d, - GGML_FP16_TO_FP32(x1->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x0->d) * y1->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y0->d, + GGML_CPU_FP16_TO_FP32(x1->d) * y1->d, }; visum = vsubq_s32(visum, vibias); @@ -2858,7 +2859,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi svuint8_t q6h_1, q6h_2, q6h_3, q6h_4; for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q6 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -3011,7 +3012,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q6 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -3128,7 +3129,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float 
d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -3199,7 +3200,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; float sumf1 = 0, sumf2 = 0; @@ -3234,7 +3235,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -3284,7 +3285,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint8x8_t scales8 = vld1_u8(x[i].scales); @@ -3329,7 +3330,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3398,7 +3399,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -3458,7 +3459,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -3521,7 +3522,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3557,7 +3558,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3630,7 +3631,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; 
const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs;

@@ -3691,7 +3692,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
     float sumf = 0.f;

     for (int i = 0; i < nb; ++i) {
-        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
+        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
         const uint8_t * GGML_RESTRICT qs = x[i].qs;
         const uint8_t * GGML_RESTRICT qh = x[i].qh;
         const uint8_t * GGML_RESTRICT signs = x[i].signs;
@@ -3786,7 +3787,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
         }

-        sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
+        sumf += y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2 + IQ1S_DELTA * sumi3);
     }

     *s = sumf;
@@ -3817,7 +3818,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qs += 4;
         }

-        sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
+        sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1);
     }

     *s = sumf;
@@ -3905,7 +3906,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
         }

-        sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
+        sumf += y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2));
     }

     *s = sumf;
@@ -3952,7 +3953,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
             qh += 2;
         }

-        sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
+        sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2);
     }

     *s = sumf;
@@ -4003,13 +4004,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
         prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);

         sumf +=
-            GGML_FP16_TO_FP32(x[ib+0].d) * GGML_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
-            GGML_FP16_TO_FP32(x[ib+1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+            GGML_CPU_FP16_TO_FP32(x[ib+0].d) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_CPU_FP16_TO_FP32(x[ib+1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
     }

 #endif
     for (; ib < nb; ++ib) {
-        const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d);
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d);
         int sumi1 = 0, sumi2 = 0;
         for (int j = 0; j < QK4_NL/2; ++j) {
             sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf];
@@ -4071,7 +4072,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
         }

-        sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
+        sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);
     }

     *s = sumf;

@@ -4079,7 +4080,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
 #else
     float sumf = 0;
     for (int ibl = 0; ibl < nb; ++ibl) {
-        const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
+        const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d;
         uint16_t h = x[ibl].scales_h;
         const uint8_t * qs = x[ibl].qs;
         const int8_t * q8 = y[ibl].qs;
diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp
index 39a0dd301..2f8bc9e25 100644
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
+#include "simd-mappings.h"
 #include "traits.h"

 #include
@@ -51,7 +52,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
         const float d = amax / ((1 << 7) - 1);
         id[row_iter] = d ? 1.0f / d : 0.0f;

-        y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
     }

     for (int j = 0; j < 8; j++) {
@@ -102,7 +103,7 @@ void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTR
         const float d = amax / ((1 << 7) - 1);
         id[row_iter] = d ? 1.0f / d : 0.0f;

-        y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
     }

     for (int j = 0; j < QK8_0 * 4; j++) {
@@ -145,7 +146,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
         const float d = amax / ((1 << 7) - 1);
         id[row_iter] = d ? 1.0f / d : 0.0f;

-        y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
     }

     for (int j = 0; j < 4; j++) {
@@ -221,7 +222,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
         const float d = amax / ((1 << 7) - 1);
         id[row_iter] = d ? 1.0f / d : 0.0f;

-        y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
+        y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
     }

     for (int j = 0; j < QK8_0 * 4; j++) {
@@ -311,7 +312,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -399,7 +400,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -514,7 +515,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                     const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -608,7 +609,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const
                     const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
                     sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
                 }
-                sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
+                sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
             }
         }
     }
@@ -1117,7 +1118,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
                                     sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
                                 }
-                                sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
+                                
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -1570,7 +1571,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -2039,7 +2040,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -2147,7 +2148,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } diff --git a/ggml/src/ggml-cpu/arch/loongarch/quants.c b/ggml/src/ggml-cpu/arch/loongarch/quants.c index f2ea96572..9e33fb322 100644 --- a/ggml/src/ggml-cpu/arch/loongarch/quants.c +++ b/ggml/src/ggml-cpu/arch/loongarch/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -474,7 +475,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 127.f / max_scalar : 0.0f; const __m256 mul = (__m256)__lasx_xvreplfr2vr_s( id ); @@ -548,7 +549,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; const __m256 mul = __lasx_xvreplfr2vr_s( id ); @@ -576,7 +577,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Compute the sum of the quants and set y[i].s const __m128i s0 = __lsx_vadd_w(__lsx_vadd_w(ni0, ni1), __lsx_vadd_w(ni2, ni3)); const __m128i s1 = __lsx_vadd_w(__lsx_vadd_w(ni4, ni5), __lsx_vadd_w(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(__lsx_vadd_w(s0, s1))); // Convert int32 to int16 ni0 = lsx_packs_w( ni0, ni1 ); @@ -667,7 +668,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m256 d = __lasx_xvreplfr2vr_s( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); __m256i qx = bytes_from_nibbles_32(x[ib].qs); @@ -699,7 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi for (; ib + 1 < nb; ib += 2) { // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0); @@ -717,7 +718,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); + const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0); @@ -766,7 +767,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -797,10 +798,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); @@ -834,7 +835,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -865,7 +866,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * 
GGML_FP16_TO_FP32(y[ib].d)); //FIXME + const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); //FIXME __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -902,7 +903,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -934,16 +935,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const __m256 dx = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d)); + const __m256 dx = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); bxhi = __lasx_xvand_v(bxhi, __lasx_xvreplgr2vr_b(0x10)); qx = __lasx_xvor_v(qx, bxhi); - const __m256 dy = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib].d)); + const __m256 dy = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); const __m256 q = mul_sum_us8_pairs_float(qx, qy); @@ -973,7 +974,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -1003,7 +1004,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { // Compute combined scale for the block - const __m256 d = __lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = __lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = __lasx_xvld((const __m256i *)x[ib].qs, 0); __m256i qy = __lasx_xvld((const __m256i *)y[ib].qs, 0); @@ -1023,7 +1024,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -1047,8 +1048,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1116,8 +1117,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1170,7 +1171,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * 
GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales @@ -1294,7 +1295,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1330,8 +1331,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -1438,9 +1439,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1477,8 +1478,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -1593,9 +1594,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1624,7 +1625,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -1713,7 +1714,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1780,7 +1781,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, 
float * GGML_RESTRICT s, size_t bs, const __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = __lasx_xvldi(0); @@ -1820,7 +1821,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -1895,7 +1896,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1980,7 +1981,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2049,7 +2050,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); @@ -2108,7 +2109,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -2168,7 +2169,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2213,7 +2214,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2279,7 +2280,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = (__m256)__lasx_xvldi(0); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t 
*)x[i].signs; @@ -2340,7 +2341,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -2451,7 +2452,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(d), __lasx_xvffint_s_w(sumi), accum); accum1 += d * sumi1; } @@ -2484,7 +2485,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -2530,9 +2531,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = lasx_madd_h(p16_1, mone); const __m256i p_2 = lasx_madd_h(p16_2, mone); - accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + accum1 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), __lasx_xvffint_s_w(p_1), accum1); - accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + accum2 = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), __lasx_xvffint_s_w(p_2), accum2); } @@ -2540,7 +2541,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -2595,7 +2596,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v sumi1 = __lasx_xvadd_w(p_1, sumi1); sumi2 = __lasx_xvadd_w(p_2, sumi2); } - accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + accum = __lasx_xvfmadd_s(__lasx_xvreplfr2vr_s(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), __lasx_xvffint_s_w(__lasx_xvadd_w(sumi1, sumi2)), accum); } @@ -2604,7 +2605,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/powerpc/quants.c b/ggml/src/ggml-cpu/arch/powerpc/quants.c index ce4e47a86..053d5cbdc 100644 --- a/ggml/src/ggml-cpu/arch/powerpc/quants.c +++ b/ggml/src/ggml-cpu/arch/powerpc/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -67,7 +68,7 @@ void quantize_row_q8_0(const float * 
GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float id = d ? 1.0f/d : 0.0f; const vector float vid = vec_splats(id); - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const vector float v = vec_round(vec_mul(srcv[j], vid)); @@ -112,7 +113,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float id = d ? 1.0f/d : 0.0f; const vector float vid = vec_splats(id); - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); vector int accv = vec_splats(0); @@ -127,7 +128,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i accv = vec_add(accv, vec_sld(accv, accv, 4)); accv = vec_add(accv, vec_sld(accv, accv, 8)); - y[i].s = GGML_FP32_TO_FP16(d * vec_extract(accv, 0)); + y[i].s = GGML_CPU_FP32_TO_FP16(d * vec_extract(accv, 0)); } #else @@ -170,8 +171,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); @@ -214,7 +215,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -249,12 +250,12 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; vsumf0 = vec_madd(vxmin, vys, vsumf0); vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); @@ -291,7 +292,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -326,8 +327,8 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed long long aux64x2_0 = {(uint64_t)(table_b2b_1[x[ib].qh[0]]), (uint64_t)(table_b2b_1[x[ib].qh[1]])}; @@ -379,7 +380,7 @@ void 
ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -415,12 +416,12 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); - vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].m)); + vector float vys = {GGML_CPU_FP16_TO_FP32(y[ib].s), 0.f, 0.f, 0.f}; vsumf0 = vec_madd(vxmin, vys, vsumf0); vector unsigned long long aux64x2_0 = {(uint64_t)(table_b2b_0[x[ib].qh[0]]), (uint64_t)(table_b2b_0[x[ib].qh[1]])}; @@ -470,7 +471,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -502,8 +503,8 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed char q8x0 = vec_xl( 0, x[ib].qs); @@ -542,7 +543,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -574,11 +575,11 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); @@ -708,8 +709,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -770,7 +771,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; 
i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -962,7 +963,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1005,11 +1006,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); vector signed short q8ysums0 = vec_xl( 0, y[i].bsums); @@ -1177,9 +1178,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1222,11 +1223,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); - vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[i].dmin)); + vector float vxmin = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].dmin)); vector float vdmin = vec_mul(vxmin, vyd); UNUSED(kmask1); @@ -1394,9 +1395,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1432,7 +1433,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -1591,7 +1592,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; 
++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1659,7 +1660,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -1742,7 +1743,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -1790,7 +1791,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -1871,7 +1872,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1939,7 +1940,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2033,7 +2034,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -2096,7 +2097,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2176,7 +2177,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2236,7 +2237,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo const vector signed char mask2 = (vector signed char)vec_xl( 0, k_mask2); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector 
float vd = vec_mul(vxd, vyd); @@ -2329,7 +2330,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -2394,7 +2395,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo vector float vsumf3 = vec_splats(0.0f); for (int i = 0; i < nb; ++i) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[i].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[i].d)); vector float vyd = vec_splats(y[i].d); vector float vd = vec_mul(vxd, vyd); @@ -2505,7 +2506,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -2546,8 +2547,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v __builtin_prefetch(y[ib].qs, 0, 1); - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); - vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d)); + vector float vyd = vec_splats(GGML_CPU_FP16_TO_FP32(y[ib].d)); vector float vd = vec_mul(vxd, vyd); vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); @@ -2582,7 +2583,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -2620,7 +2621,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v for (int ibl = 0; ibl < nb; ++ibl) { - vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ibl].d)); + vector float vxd = vec_splats(GGML_CPU_FP16_TO_FP32(x[ibl].d)); vector float vyd = vec_splats(y[ibl].d); vector float vd = vec_mul(vxd, vyd); @@ -2697,7 +2698,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index 6f3aa94fb..8b64d8adc 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -45,7 +46,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 
1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); @@ -85,7 +86,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); vfloat32m8_t x0 = __riscv_vfmul_vf_f32m8(v_x, id, vl); @@ -102,7 +103,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // set y[i].s int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = GGML_FP32_TO_FP16(sum*d); + y[i].s = GGML_CPU_FP32_TO_FP16(sum*d); } #else @@ -160,7 +161,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } #endif @@ -177,7 +178,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -225,7 +226,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } #endif @@ -242,7 +243,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -293,7 +294,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - sumf += (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } #endif @@ -316,7 +317,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -366,7 +367,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi vint32m1_t sum = __riscv_vwredsum_vs_i16m4_i32m1(mul, zero, vl); int32_t sumi = __riscv_vmv_x_s_i32m1_i32(sum); - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } #endif @@ -389,7 +390,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += 
(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -427,7 +428,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } #endif @@ -438,7 +439,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -465,8 +466,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * q2 = x[i].qs; const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); uint8_t *patmp = atmp; int vsums; int tmp; @@ -569,8 +570,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); size_t vl = 16; @@ -644,8 +645,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * q2 = x[i].qs; const int8_t * q8 = y[i].qs; const uint8_t * sc = x[i].scales; - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); uint8_t *patmp = atmp; int vsums; int tmp; @@ -750,8 +751,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -916,7 +917,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi q3 += 32; q8 += 128; scale += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d * isum; } @@ -1017,7 +1018,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d*sum_t; @@ -1134,7 +1135,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi q3 += 32; q8 += 128; scale += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; sumf += d * isum; } break; @@ -1202,7 +1203,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = 
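// [editorial aside, not part of the patch] The recurring update in the q4_1
// and q5_1 paths above,
//     sumf += (d_x * d_y) * sumi + m_x * s_y,
// follows from the dequantization model these formulas imply (block layouts
// assumed from the visible code, not copied from the ggml headers):
//     x_j ~= d_x * q_j + m_x        unsigned quants q_j
//     y_j ~= d_y * p_j              signed 8-bit quants p_j
//     s_y  = d_y * sum_j p_j        precomputed at quantization time
// so  sum_j x_j*y_j ~= d_x*d_y * sum_j q_j*p_j + m_x*d_y * sum_j p_j
//                    = d_x*d_y * sumi          + m_x * s_y.
// A scalar q4_1 x q8_1 reference under these assumptions:
static float vec_dot_q4_1_q8_1_ref(int nblocks, const block_q4_1 *x, const block_q8_1 *y) {
    float sumf = 0.0f;
    for (int ib = 0; ib < nblocks; ++ib) {
        int sumi = 0;
        for (int j = 0; j < 16; ++j) {
            sumi += (x[ib].qs[j] & 0x0F) * y[ib].qs[j];      // low nibble:  element j
            sumi += (x[ib].qs[j] >>   4) * y[ib].qs[j + 16]; // high nibble: element j+16
        }
        sumf += GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) * sumi
              + GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s);
    }
    return sumf;
}
// [end aside]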
GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1239,8 +1240,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int tmp, tmp2, sumi; __asm__ __volatile__( @@ -1361,8 +1362,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi size_t vl = 8; - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); vint16mf2_t q8sums_0 = __riscv_vlse16_v_i16mf2(y[i].bsums, 4, vl); vint16mf2_t q8sums_1 = __riscv_vlse16_v_i16mf2(y[i].bsums+1, 4, vl); @@ -1422,8 +1423,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi break; case 128: for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int tmp, tmp2, sumi; __asm__ __volatile__( @@ -1580,9 +1581,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1627,8 +1628,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * GGML_RESTRICT hm = x[i].qh; const int8_t * GGML_RESTRICT q8 = y[i].qs; - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; vint16m1_t q8sums_0 = __riscv_vlse16_v_i16m1(y[i].bsums, 4, vl); vint16m1_t q8sums_1 = __riscv_vlse16_v_i16m1(y[i].bsums+1, 4, vl); @@ -1749,9 +1750,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1778,7 +1779,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -1862,7 +1863,7 @@ void ggml_vec_dot_q6_K_q8_K(int 
n, float * GGML_RESTRICT s, size_t bs, const voi case 256: for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q6 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -1943,7 +1944,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi case 128: for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * restrict q6 = x[i].ql; const uint8_t * restrict qh = x[i].qh; @@ -2058,7 +2059,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/ggml/src/ggml-cpu/arch/riscv/repack.cpp b/ggml/src/ggml-cpu/arch/riscv/repack.cpp index 0882b4102..45c91a694 100644 --- a/ggml/src/ggml-cpu/arch/riscv/repack.cpp +++ b/ggml/src/ggml-cpu/arch/riscv/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include @@ -90,16 +91,16 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const vfloat32m1_t facc = __riscv_vfcvt_f_x_v_f32m1(sumi_h8, vl / 4); // vector version needs Zvfhmin extension - const float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d); + const float a_scale = GGML_CPU_FP16_TO_FP32(a_ptr[l].d); const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) }; const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scale, vl / 4); @@ -129,7 +130,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -181,20 +182,20 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo // vector version needs Zvfhmin extension const float a_scales[4] = { - GGML_FP16_TO_FP32(a_ptr[l].d[0]), - GGML_FP16_TO_FP32(a_ptr[l].d[1]), - GGML_FP16_TO_FP32(a_ptr[l].d[2]), - GGML_FP16_TO_FP32(a_ptr[l].d[3]) + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(a_ptr[l].d[3]) }; const float b_scales[8] = { - GGML_FP16_TO_FP32(b_ptr[l].d[0]), - 
GGML_FP16_TO_FP32(b_ptr[l].d[1]), - GGML_FP16_TO_FP32(b_ptr[l].d[2]), - GGML_FP16_TO_FP32(b_ptr[l].d[3]), - GGML_FP16_TO_FP32(b_ptr[l].d[4]), - GGML_FP16_TO_FP32(b_ptr[l].d[5]), - GGML_FP16_TO_FP32(b_ptr[l].d[6]), - GGML_FP16_TO_FP32(b_ptr[l].d[7]) + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[0]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[1]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[2]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[3]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[4]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[5]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[6]), + GGML_CPU_FP16_TO_FP32(b_ptr[l].d[7]) }; const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4); @@ -382,7 +383,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 26bd90875..a840219a4 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -49,7 +50,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f / d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const __vector float v = vec_mul(srcv[j], vec_splats(id)); @@ -94,7 +95,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 
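// [editorial aside, not part of the patch] Every quantize_row_q8_0 variant
// touched by this patch implements the same per-block scheme: scale by the
// block's absolute maximum so the values fit int8, then store the scale as
// FP16. A scalar sketch (block size 32 as in the code above; roundf is chosen
// here for illustration, the SIMD paths round via their native conversions):
#include <math.h>
#include <stdint.h>
static void quantize_row_q8_0_sketch(const float *x, block_q8_0 *y, int nblocks) {
    for (int i = 0; i < nblocks; ++i) {
        float amax = 0.0f;                      // absolute max of the block
        for (int j = 0; j < 32; ++j) {
            const float v = fabsf(x[i*32 + j]);
            if (v > amax) amax = v;
        }
        const float d  = amax / 127.0f;         // i.e. (1 << 7) - 1
        const float id = d ? 1.0f/d : 0.0f;     // guard for an all-zero block
        y[i].d = GGML_CPU_FP32_TO_FP16(d);
        for (int j = 0; j < 32; ++j) {
            y[i].qs[j] = (int8_t) roundf(x[i*32 + j] * id);
        }
    }
}
// [end aside]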
1.0f / d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); __vector int32_t acc = vec_splats(0); @@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i acc = vec_add(acc, vi); } - y[i].s = GGML_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); + y[i].s = GGML_CPU_FP32_TO_FP16(d * (acc[0] + acc[1] + acc[2] + acc[3])); } #else GGML_UNUSED(nb); @@ -164,7 +165,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi __vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); const __vector float v_xy = vec_float(vec_unpackh(v_xy_)); - const __vector float v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); acc = vec_madd(v_xy, v_d, acc); } @@ -185,7 +186,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -219,7 +220,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi __builtin_prefetch(x[ib].qs, 0, 1); __builtin_prefetch(y[ib].qs, 0, 1); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const uint8x16_t v_x = vec_xl(0, x[ib].qs); const int8x16_t v_xl = (const int8x16_t)(v_x & v_m); @@ -231,7 +232,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); const float32x4_t v_xy = vec_float(v_xy_); - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); acc = vec_madd(v_xy, v_d, acc); } @@ -252,7 +253,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int32x4_t v_xy_ = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); const float32x4_t v_xy = vec_float(v_xy_); - const float32x4_t v_d = vec_splats(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); acc = vec_madd(v_xy, v_d, acc); } @@ -305,7 +306,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -348,7 +349,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sum = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * restrict x0l = 
x[i].qs; const uint8_t * restrict x0h = x[i].hmask; @@ -497,7 +498,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -537,8 +538,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); @@ -647,9 +648,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -698,8 +699,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const int16x8_t v_ysumsl = vec_xl(0 , y[i].bsums); const int16x8_t v_ysumsh = vec_xl(16, y[i].bsums); @@ -819,9 +820,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -859,7 +860,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi int8x16_t v_y[4]; for (int i = 0; i < nb; ++i) { - const float d_all = GGML_FP16_TO_FP32(x[i].d); + const float d_all = GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT x0l = x[i].ql; const uint8_t * GGML_RESTRICT x0h = x[i].qh; @@ -1004,7 +1005,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1071,7 +1072,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi // float sumf = 0; // for (int i = 0; i < nb; ++i) { -// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // const uint16_t * GGML_RESTRICT q2 = x[i].qs; // 
const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1121,7 +1122,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi // float sumf = 0.f; // for (int i = 0; i < nb; ++i) { -// const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; +// const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; // const uint16_t * GGML_RESTRICT q2 = x[i].qs; // const int8_t * GGML_RESTRICT q8 = y[i].qs; // int32_t bsum = 0; @@ -1182,12 +1183,12 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs); const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh); - sumf += GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); + sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]); } #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -1257,7 +1258,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2; } - sumf += GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); + sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2); } *s = sumf; @@ -1265,7 +1266,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/wasm/quants.c b/ggml/src/ggml-cpu/arch/wasm/quants.c index 4ec97f533..b0904d8a3 100644 --- a/ggml/src/ggml-cpu/arch/wasm/quants.c +++ b/ggml/src/ggml-cpu/arch/wasm/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -65,7 +66,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); for (int j = 0; j < 8; j++) { const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); @@ -110,7 +111,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i const float d = amax / ((1 << 7) - 1); const float id = d ? 
1.0f/d : 0.0f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); v128_t accv = wasm_i32x4_splat(0); @@ -126,7 +127,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i accv = wasm_i32x4_add(accv, vi); } - y[i].s = GGML_FP32_TO_FP16( + y[i].s = GGML_CPU_FP32_TO_FP16( d * (wasm_i32x4_extract_lane(accv, 0) + wasm_i32x4_extract_lane(accv, 1) + wasm_i32x4_extract_lane(accv, 2) + @@ -324,8 +325,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi ); // Accumulate results with scaling - float scale0 = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); - float scale1 = GGML_FP16_TO_FP32(x1->d) * GGML_FP16_TO_FP32(y1->d); + float scale0 = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); + float scale1 = GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d); sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp0), wasm_f32x4_splat(scale0))); sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(dp1), wasm_f32x4_splat(scale1))); @@ -348,7 +349,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -428,7 +429,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); } sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -454,7 +455,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -491,7 +492,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi const block_q5_1 * GGML_RESTRICT x0 = &x[ib]; const block_q8_1 * GGML_RESTRICT y0 = &y[ib]; - summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s); + summs += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -538,7 +539,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d)))); + wasm_f32x4_splat(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)))); } sumf = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -564,7 +565,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -620,7 +621,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const v128_t sum_dots = wasm_i32x4_add(wasm_i32x4_add(dx0_0, dx0_1), 
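// [editorial aside, not part of the patch] The WASM paths above all reduce to
// one idiom: widen int8 lanes to int16, take pairwise dot products into int32
// lanes, then fold in the combined FP16 scale with a float multiply-add. A
// stand-alone sketch of that idiom (the helper name and structure are assumed;
// only wasm_simd128.h intrinsics that also appear in the patch are used):
#include <wasm_simd128.h>
#include <stdint.h>
static inline v128_t dot16_i8_scaled(v128_t acc, const int8_t *xq, const int8_t *yq, float scale) {
    const v128_t xv = wasm_v128_load(xq);   // 16 x int8
    const v128_t yv = wasm_v128_load(yq);
    const v128_t lo = wasm_i32x4_dot_i16x8(wasm_i16x8_extend_low_i8x16 (xv),
                                           wasm_i16x8_extend_low_i8x16 (yv));
    const v128_t hi = wasm_i32x4_dot_i16x8(wasm_i16x8_extend_high_i8x16(xv),
                                           wasm_i16x8_extend_high_i8x16(yv));
    const v128_t s  = wasm_i32x4_add(lo, hi);       // 4 x int32 partial dots
    return wasm_f32x4_add(acc, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(s),
                                              wasm_f32x4_splat(scale)));
}
// The caller finishes with the lane sum wasm_f32x4_extract_lane(acc, 0..3),
// exactly as the functions above do. [end aside]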
wasm_i32x4_add(dx1_0, dx1_1)); // Convert to float and accumulate - const float scale = GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d); + const float scale = GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d); sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(sum_dots), wasm_f32x4_splat(scale))); } @@ -635,7 +636,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -746,8 +747,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi isum += wasm_i32x4_extract_lane(isum_vec, 0); } - const float dall = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dall = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf += dall * isum - dmin * summs; } @@ -768,8 +769,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -880,7 +881,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi } // Accumulate results - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const v128_t v_d = wasm_f32x4_splat(d); v128_t v_sum = wasm_f32x4_add( wasm_f32x4_mul(wasm_f32x4_convert_i32x4(v_acc0), v_d), @@ -957,7 +958,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -991,8 +992,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Corrected sign const uint8_t * GGML_RESTRICT q4 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1136,9 +1137,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1170,8 +1171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi float sumf = 0; for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Fixed 
sign + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); // Fixed sign const uint8_t * GGML_RESTRICT q5 = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -1331,9 +1332,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1420,7 +1421,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi wasm_v128_store(&aux32[0], acc0); wasm_v128_store(&aux32[4], acc1); - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) { sums[l] += d * aux32[l]; } @@ -1470,7 +1471,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c index e3f722b52..e7527c00a 100644 --- a/ggml/src/ggml-cpu/arch/x86/quants.c +++ b/ggml/src/ggml-cpu/arch/x86/quants.c @@ -3,6 +3,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu.h" +#include "simd-mappings.h" #include "../../quants.h" #include "../../ggml-cpu-impl.h" @@ -256,9 +257,9 @@ static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_ // quad fp16 delta calculation static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) { - // GGML_FP16_TO_FP32 is faster than Intel F16C - return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)), - _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0))); + // GGML_CPU_FP16_TO_FP32 is faster than Intel F16C + return _mm256_set_m128(_mm_set1_ps(GGML_CPU_FP16_TO_FP32(x1) * GGML_CPU_FP16_TO_FP32(y1)), + _mm_set1_ps(GGML_CPU_FP16_TO_FP32(x0) * GGML_CPU_FP16_TO_FP32(y0))); } #endif #elif defined(__SSSE3__) @@ -305,7 +306,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = maxScalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -401,7 +402,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Quantize these floats const float d = max_scalar / 127.f; - y[i].d = GGML_FP32_TO_FP16(d); + y[i].d = GGML_CPU_FP32_TO_FP16(d); const float id = ( max_scalar != 0.0f ) ? 
127.f / max_scalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -425,7 +426,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -455,7 +456,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); + y[i].s = GGML_CPU_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); @@ -552,7 +553,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m256 d = _mm256_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); __m256i qx = bytes_from_nibbles_32(x[ib].qs); @@ -613,7 +614,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi _mm_prefetch(&y[ib] + sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 0 and 1 - const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + const __m128 d_0_1 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) ); const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[ib].qs); @@ -631,7 +632,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi _mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0); // Compute combined scale for the block 2 and 3 - const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[ib + 1].d) * GGML_FP16_TO_FP32(y[ib + 1].d) ); + const __m128 d_2_3 = _mm_set1_ps( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) ); const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); @@ -680,7 +681,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -711,10 +712,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const float d0 = GGML_FP16_TO_FP32(x[ib].d); - const float d1 = GGML_FP16_TO_FP32(y[ib].d); + const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d); + const float d1 = GGML_CPU_FP16_TO_FP32(y[ib].d); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); @@ -752,7 +753,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + 
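// [editorial aside, not part of the patch] hsum_i32_8 / hsum_i32_4 used above
// are ggml-cpu helpers defined elsewhere; a plausible AVX2 shape for the
// 8-lane version is sketched below as an assumption (the in-tree helper may
// differ in detail), folding 256 -> 128 -> 64 -> 32 bits:
#include <immintrin.h>
static inline int hsum_i32_8_sketch(const __m256i a) {
    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a),
                                         _mm256_extracti128_si256(a, 1)); // 8 -> 4 lanes
    const __m128i hi64   = _mm_unpackhi_epi64(sum128, sum128);
    const __m128i sum64  = _mm_add_epi32(hi64, sum128);                   // 4 -> 2 lanes
    const __m128i hi32   = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));                 // 2 -> 1
}
// [end aside]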
GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -783,7 +784,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -807,7 +808,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { /* Compute combined scale for the block */ - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -851,7 +852,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -883,16 +884,16 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i qx = bytes_from_nibbles_32(x[ib].qs); __m256i bxhi = bytes_from_bits_32(x[ib].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); qx = _mm256_or_si256(qx, bxhi); - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_us8_pairs_float(qx, qy); @@ -910,9 +911,9 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { - const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d)); + const __m256 dx = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d)); - summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); + summs += GGML_CPU_FP16_TO_FP32(x[ib].m) * GGML_CPU_FP16_TO_FP32(y[ib].s); __m256i bx_0 = bytes_from_nibbles_32(x[ib].qs); const __m256i bxhi = bytes_from_bits_32(x[ib].qh); @@ -926,7 +927,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi bxh = _mm_or_si128(bxh, bxhih); bx_0 = MM256_SET_M128I(bxh, bxl); - const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[ib].d)); + const __m256 dy = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib].d)); const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[ib].qs); const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); @@ -956,7 +957,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += 
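// [editorial aside, not part of the patch] In the q5 paths above,
// bytes_from_bits_32 plus the 0x10 mask rebuild the 5th quant bit: qs[] holds
// two 4-bit values per byte, and qh packs the missing high bit of each of the
// 32 elements into one 32-bit word. The scalar tails do the same; spelled out
// here for q5_0 (q5_1 is identical except the -16 recentering is replaced by
// the m/s min handling shown above):
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
for (int j = 0; j < 16; ++j) {
    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;     // bit j      -> bit 4
    const uint8_t xh_1 =  (qh >> (j + 12))       & 0x10;     // bit (j+16) -> bit 4
    const int x0 = (int)((x[ib].qs[j] & 0x0F) | xh_0) - 16;  // element j
    const int x1 = (int)((x[ib].qs[j] >>   4) | xh_1) - 16;  // element j + 16
    sumi0 += x0 * y[ib].qs[j];
    sumi1 += x1 * y[ib].qs[j + 16];
}
// [end aside]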
(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -986,7 +987,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi // Main loop for (; ib < nb; ++ib) { // Compute combined scale for the block - const __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); + const __m256 d = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); __m256i qx = _mm256_loadu_si256((const __m256i *)x[ib].qs); __m256i qy = _mm256_loadu_si256((const __m256i *)y[ib].qs); @@ -1025,7 +1026,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -1144,7 +1145,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_sub_epi16(sumi0, ysum); sumi0 = _mm256_add_epi16(sumi0, _mm256_add_epi16(sumi1, sumi2)); @@ -1190,7 +1191,7 @@ void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); } *s = sumf; @@ -1244,7 +1245,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } const __m256i ysum = _mm256_loadu_si256((const __m256i *) y[i].bsums); - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d)); sumi0 = _mm256_add_epi16(sumi0, sumi1); sumi0 = _mm256_sub_epi16(sumi0, ysum); @@ -1269,7 +1270,7 @@ void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo } } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); sumf += (float) sumi * d; } @@ -1299,8 +1300,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1366,8 +1367,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1477,8 +1478,8 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * 
GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -1533,7 +1534,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1638,7 +1639,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q3 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -1824,7 +1825,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -1862,8 +1863,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -1928,8 +1929,8 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q4 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2049,9 +2050,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2092,8 +2093,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const uint8_t * GGML_RESTRICT q5 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -2170,8 +2171,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); const uint8_t * GGML_RESTRICT q5 = x[i].qs; const 
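// [editorial aside, not part of the patch] The dall/dmin pairs converted above
// implement the K-quant super-block model. For q2_K, visible in the scalar
// tails of this patch, each 16-element sub-block l carries a 4-bit scale sc_l
// and a 4-bit min m_l packed in scales[l], with
//     x_j ~= d * sc_l * q_j  -  dmin * m_l.
// Dotting with y_j ~= d_y * p_j, and writing bsums[l] for the precomputed
// per-sub-block sums of p_j:
//     sum_j x_j*y_j ~= (d * d_y)    * sum_l sc_l * (sub-block dot l)
//                    - (dmin * d_y) * sum_l m_l  * bsums[l]
// which is exactly the "sumf += dall * isum - dmin * summs" update (some
// branches fold the minus sign into dmin up front). The min correction thus
// costs one multiply per sub-block rather than one per element.
// [end aside]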
int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2311,9 +2312,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2344,7 +2345,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -2422,7 +2423,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int i = 0; i < nb; ++i) { - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); const uint8_t * GGML_RESTRICT q4 = x[i].ql; const uint8_t * GGML_RESTRICT qh = x[i].qh; @@ -2555,7 +2556,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -2622,7 +2623,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m256i sumi1 = _mm256_setzero_si256(); @@ -2663,7 +2664,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; __m128i sumi1_0 = _mm_setzero_si128(); @@ -2717,7 +2718,7 @@ void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -2792,7 +2793,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -2913,7 +2914,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = 
y[i].qs; @@ -3035,7 +3036,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3104,7 +3105,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); @@ -3177,7 +3178,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)(x[i].qs + QK_K/8); @@ -3253,7 +3254,7 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -3313,7 +3314,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3358,7 +3359,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3414,7 +3415,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -3480,7 +3481,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; @@ -3565,7 +3566,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const float d = 
GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint16_t * GGML_RESTRICT signs = (const uint16_t *)x[i].signs; @@ -3648,7 +3649,7 @@ void ggml_vec_dot_iq3_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -3753,7 +3754,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi), accum); accum1 += d * sumi1; @@ -3801,7 +3802,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo + (y[i].bsums[2*ib+2] + y[i].bsums[2*ib+3]) * (qh[ib+1] & 0x8000 ? -1 : 1) * ls2; } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum); accum1 += d * sumi1; @@ -3835,7 +3836,7 @@ void ggml_vec_dot_iq1_s_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -3947,7 +3948,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 8; qh += 4; } - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); @@ -4033,7 +4034,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qs += 8; qh += 4; } - const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); + const __m256 d = _mm256_set1_ps(y[i].d * GGML_CPU_FP16_TO_FP32(scale.f16)); accum1 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi1_1, sumi1_0))), accum1); accum2 = _mm256_add_ps(_mm256_mul_ps(d, _mm256_cvtepi32_ps(MM256_SET_M128I(sumi2_1, sumi2_0))), accum2); @@ -4083,7 +4084,7 @@ void ggml_vec_dot_iq1_m_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo qh += 2; } - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); } *s = sumf; @@ -4129,9 +4130,9 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2); const __m256i p_1 = _mm256_madd_epi16(p16_1, mone); const __m256i p_2 = _mm256_madd_epi16(p16_2, mone); - accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + accum1 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 0].d)*GGML_CPU_FP16_TO_FP32(x[ib + 0].d)), _mm256_cvtepi32_ps(p_1), accum1); - accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 
1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + accum2 = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y[ib + 1].d)*GGML_CPU_FP16_TO_FP32(x[ib + 1].d)), _mm256_cvtepi32_ps(p_2), accum2); } @@ -4164,7 +4165,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v #endif for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -4219,7 +4220,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v sumi1 = _mm256_add_epi32(p_1, sumi1); sumi2 = _mm256_add_epi32(p_2, sumi2); } - accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + accum = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accum); } @@ -4267,7 +4268,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v } __m128i sumi12_0 = _mm_add_epi32(sumi1_0, sumi2_0); __m128i sumi12_1 = _mm_add_epi32(sumi1_1, sumi2_1); - accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[ibl].d)*y[ibl].d), + accum = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(x[ibl].d)*y[ibl].d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi12_1, sumi12_0))), accum); } @@ -4276,7 +4277,7 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v #else float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp index e7635a294..c00c1e541 100644 --- a/ggml/src/ggml-cpu/arch/x86/repack.cpp +++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include @@ -39,11 +40,11 @@ static inline __m512 __avx512_f32cx8x2_load(ggml_fp16_t *x, ggml_fp16_t *y) { float tmp[16]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } for (int i = 0; i < 8; i++) { - tmp[i + 8] = GGML_FP16_TO_FP32(y[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(y[i]); } return _mm512_loadu_ps(tmp); @@ -54,10 +55,10 @@ static inline __m512 __avx512_repeat_f32cx16_load(__m128i x) { _mm_storeu_si128((__m128i*)tmphalf, x); for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 4] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 8] = GGML_FP16_TO_FP32(tmphalf[i]); - tmp[i + 12] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 8] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); + tmp[i + 12] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm512_loadu_ps(tmp); @@ -67,7 +68,7 @@ static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -76,8 +77,8 @@ static inline __m256 __avx_repeat_f32cx8_load(ggml_fp16_t *x) { float tmp[8]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); - tmp[i + 4] = 
GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); + tmp[i + 4] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -88,7 +89,7 @@ static inline __m256 __avx_rearranged_f32cx8_load(ggml_fp16_t *x, __m128i arrang _mm_storeu_si128((__m128i*)tmphalf, _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *) x), arrangeMask)); for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(tmphalf[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(tmphalf[i]); } return _mm256_loadu_ps(tmp); @@ -211,7 +212,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR id[row_iter] = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; //d ? 1.0f / d : 0.0f; // Store the scale for the individual block - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); // Store the values in blocks of eight values - Aim is to use these later for block interleaving srcv[row_iter][0] = v0; @@ -297,7 +298,7 @@ void ggml_quantize_mat_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -647,7 +648,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const __m256 col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask); // Load and convert to FP32 scale from block_q8_0 - const __m256 row_scale_f32 = _mm256_set1_ps(GGML_FP16_TO_FP32(a_ptr[b].d)); + const __m256 row_scale_f32 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(a_ptr[b].d)); // Load the block values in block_q8_0 in batches of 16 bytes and replicate the same across 256 bit vector __m256i lhs_vec_0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)a_ptr[b].qs)); @@ -706,7 +707,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -972,13 +973,13 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; } } for (int sb = 0; sb < 8; sb++) { uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; } } } @@ -1755,7 +1756,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -3259,7 +3260,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, 
size_t bs, const vo sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; } } } @@ -3268,7 +3269,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo for(int m = 0; m < 4; m++) { const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); for(int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; } } } diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h index 5624176cc..353563dc3 100644 --- a/ggml/src/ggml-cpu/common.h +++ b/ggml/src/ggml-cpu/common.h @@ -4,6 +4,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "simd-mappings.h" #ifdef __cplusplus @@ -12,11 +13,11 @@ // convenience functions/macros for use in template calls // note: these won't be required after the 'traits' lookup table is used. static inline ggml_fp16_t f32_to_f16(float x) { - return GGML_FP32_TO_FP16(x); + return GGML_CPU_FP32_TO_FP16(x); } static inline float f16_to_f32(ggml_fp16_t x) { - return GGML_FP16_TO_FP32(x); + return GGML_CPU_FP16_TO_FP32(x); } static inline ggml_bf16_t f32_to_bf16(float x) { diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 73a8f9398..d839cf5c5 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -62,11 +62,17 @@ struct ggml_compute_params { #if defined(__s390x__) && defined(__VEC__) #ifndef __VXE__ #define __VXE__ -#endif +#endif // __VXE__ #ifndef __VXE2__ #define __VXE2__ -#endif -#endif +#endif // __VXE2__ +#endif // __s390x__ && __VEC__ + +#if defined(__s390x__) && defined(GGML_NNPA) #ifndef __NNPA__ +#define __NNPA__ +#endif // __NNPA__ +#endif // __s390x__ && GGML_NNPA #if defined(__ARM_FEATURE_SVE) #include <arm_sve.h> diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1d3cd009a..7cae96f4b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -72,6 +72,9 @@ #define UNUSED GGML_UNUSED #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) +// precomputed f32 table for f16 (256 KB) (simd-mappings.h) +float ggml_table_f32_f16[1 << 16]; + #if defined(__ARM_ARCH) struct ggml_arm_arch_features_type { int sve_cnt; @@ -736,7 +739,7 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -795,7 +798,7 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { { assert(tensor->nb[0] == sizeof(ggml_fp16_t)); for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); + ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_CPU_FP32_TO_FP16(value)); } } break; case GGML_TYPE_BF16: @@ -846,7 +849,7 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t 
*)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -891,7 +894,7 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { case GGML_TYPE_F16: { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -920,7 +923,7 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -947,7 +950,7 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -985,7 +988,7 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { } case GGML_TYPE_F16: { - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } case GGML_TYPE_BF16: { @@ -1024,7 +1027,7 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(tensor->data))[i] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -1051,7 +1054,7 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, case GGML_TYPE_I32: return ((int32_t *) data)[0]; case GGML_TYPE_F16: - return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + return GGML_CPU_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); case GGML_TYPE_BF16: return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: @@ -1078,7 +1081,7 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, } break; case GGML_TYPE_F16: { - ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); + ((ggml_fp16_t *)(data))[0] = GGML_CPU_FP32_TO_FP16(value); } break; case GGML_TYPE_BF16: { @@ -3141,9 +3144,24 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); _mm_storel_epi64((__m128i *)(y + i), y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0)); + float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4)); + uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } + for (; i + 3 < n; i += 4) { + float32x4_t v_x = vec_xl(0, (const float *)(x + i)); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0); + uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0); + vec_xst(v_y, 0, (ggml_fp16_t *)(y + i)); + } #endif for (; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(x[i]); } } @@ -3167,9 +3185,25 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { __m128 y_vec = _mm_cvtph_ps(x_vec); _mm_storeu_ps(y + i, y_vec); } +#elif defined(__NNPA__) + for (; i + 7 < n; i += 8) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = 
vec_extend_to_fp32_hi(v_yd, 0); + float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i + 0)); + vec_xst(v_yl, 0, (float *)(y + i + 4)); + } + for (; i + 3 < n; i += 4) { + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i)); + uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0); + float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0); + vec_xst(v_yh, 0, (float *)(y + i)); + } #endif + for (; i < n; ++i) { - y[i] = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP16_TO_FP32(x[i]); } } @@ -3369,6 +3403,14 @@ int ggml_cpu_has_vxe(void) { #endif } +int ggml_cpu_has_nnpa(void) { +#if defined(GGML_NNPA) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_neon(void) { #if defined(__ARM_ARCH) && defined(__ARM_NEON) return 1; @@ -3418,7 +3460,7 @@ int ggml_cpu_has_sme(void) { } void ggml_cpu_init(void) { - // needed to initialize f16 tables + // needed to initialize ggml_time { struct ggml_init_params params = { 0, NULL, false }; struct ggml_context * ctx = ggml_init(params); @@ -3439,9 +3481,10 @@ void ggml_cpu_init(void) { uint16_t u16; ggml_fp16_t fp16; } u = {i}; - float f = GGML_FP16_TO_FP32(u.fp16); - ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); + float f = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + ggml_table_f32_f16[i] = f; + ggml_table_gelu_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_f32(f)); + ggml_table_gelu_quick_f16[i] = GGML_CPU_FP32_TO_FP16(ggml_gelu_quick_f32(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 735ef3f01..a98866a2d 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -578,6 +578,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_vxe()) { features.push_back({ "VXE", "1" }); } + if (ggml_cpu_has_nnpa()) { + features.push_back({ "NNPA", "1" }); + } if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 7ed3874af..ed61869a5 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -52,6 +52,7 @@ #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-quants.h" +#include "simd-mappings.h" #include #include @@ -73,7 +74,7 @@ namespace { inline float unhalf(ggml_fp16_t d) { - return GGML_FP16_TO_FP32(d); + return GGML_CPU_FP16_TO_FP32(d); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -252,7 +253,7 @@ template <> inline float32x4_t load(const ggml_fp16_t * p) { float tmp[4]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(p[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(p[i]); } return vec_xl(0, (const float *)(tmp)); diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index eff4a53e3..8531baf6c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -108,7 +108,7 @@ static void ggml_compute_forward_dup_f16( for (int i01 = ir0; i01 < ir1; i01++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); + dst_ptr[id] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]); id++; } } @@ -130,7 +130,7 @@ static void ggml_compute_forward_dup_f16( const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + 
i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); + src0_f32[i00] = GGML_CPU_FP16_TO_FP32(src0_ptr[i00]); } quantize_row_q(src0_f32, dst_ptr + id, ne00); @@ -156,7 +156,7 @@ static void ggml_compute_forward_dup_f16( for (int i00 = 0; i00 < ne00; i00++) { const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); + dst_ptr[id] = GGML_CPU_FP16_TO_FP32(*src0_ptr); id++; } } @@ -267,7 +267,7 @@ static void ggml_compute_forward_dup_f16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); + *(float *) dst_ptr = GGML_CPU_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -372,7 +372,7 @@ static void ggml_compute_forward_dup_bf16( for (int i01 = ir0; i01 < ir1; i01++) { const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); id++; } } @@ -473,7 +473,7 @@ static void ggml_compute_forward_dup_bf16( for (int i00 = 0; i00 < ne00; i00++) { const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); id++; } } @@ -566,7 +566,7 @@ static void ggml_compute_forward_dup_bf16( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); if (++i10 == ne0) { i10 = 0; @@ -765,7 +765,7 @@ static void ggml_compute_forward_dup_f32( for (int i00 = 0; i00 < ne00; i00++) { const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + dst_ptr[id] = GGML_CPU_FP32_TO_FP16(*src0_ptr); id++; } } @@ -878,7 +878,7 @@ static void ggml_compute_forward_dup_f32( const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); + *(ggml_fp16_t *) dst_ptr = GGML_CPU_FP32_TO_FP16(*(const float *) src0_ptr); if (++i10 == ne0) { i10 = 0; @@ -1419,7 +1419,7 @@ static void ggml_compute_forward_add1_f16_f32( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1435,7 +1435,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_is_scalar(src1)); // scalar to add - const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); + const float v = 
GGML_CPU_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); const int ith = params->ith; const int nth = params->nth; @@ -1467,7 +1467,7 @@ static void ggml_compute_forward_add1_f16_f16( ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); for (int i = 0; i < ne0; i++) { - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + v); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(src0_ptr[i]) + v); } } } @@ -1889,7 +1889,7 @@ static void ggml_compute_forward_sum_f16( } } } - ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); + ((ggml_fp16_t *) dst->data)[0] = GGML_CPU_FP32_TO_FP16(sum); } static void ggml_compute_forward_sum_bf16( @@ -2660,7 +2660,7 @@ static void ggml_compute_forward_gelu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2763,7 +2763,7 @@ static void ggml_compute_forward_gelu_erf_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2866,7 +2866,7 @@ static void ggml_compute_forward_gelu_quick_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -2969,7 +2969,7 @@ static void ggml_compute_forward_silu_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -3163,7 +3163,7 @@ static void ggml_compute_forward_silu_back_f16( #ifndef NDEBUG for (int k = 0; k < nc; k++) { const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); GGML_UNUSED(v); assert(!isnan(v)); assert(!isinf(v)); @@ -4500,7 +4500,7 @@ static void ggml_compute_forward_get_rows_back_f32_f16( for (int j = 0; j < nc; ++j) { ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + i*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_FP16_TO_FP32(v); + ((float *) ((char *) dst->data + r*dst->nb[1]))[j] += GGML_CPU_FP16_TO_FP32(v); } } } @@ -4792,7 +4792,7 @@ static void ggml_compute_forward_soft_max_f32( if (mp_f32) { if (use_f16) { for (int i = 0; i < nc; ++i) { - wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]); + wp[i] += slope*GGML_CPU_FP16_TO_FP32(mp_f16[i]); } } else { for (int i = 0; i < nc; ++i) { @@ -5018,8 +5018,8 @@ static void ggml_compute_forward_clamp_f16( ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); for (int i = 0; i < nc; i++) { - float v = GGML_FP16_TO_FP32(src0_ptr[i]); - dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min)); + float v = GGML_CPU_FP16_TO_FP32(src0_ptr[i]); + dst_ptr[i] = GGML_CPU_FP32_TO_FP16(MAX(MIN(v, max), min)); } } } @@ -5476,11 +5476,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * 
const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { @@ -5492,11 +5492,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims/2]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } } else { @@ -5507,11 +5507,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[1]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[1] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } @@ -5525,11 +5525,11 @@ static void ggml_compute_forward_rope_f16( const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims]); + const float x0 = GGML_CPU_FP16_TO_FP32(src[0]); + const float x1 = GGML_CPU_FP16_TO_FP32(src[n_dims]); - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); + dst_data[0] = GGML_CPU_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims] = GGML_CPU_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { @@ -5640,7 +5640,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( for (int64_t i11 = 0; i11 < ne11; i11++) { const float * const src = (float *)((char *) src1->data + i11*nb11); for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne11 + i11] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne11 + i11] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -5933,7 +5933,7 @@ static void ggml_compute_forward_im2col_f16( if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { dst_data[iic*(KH*KW) + ikh*KW + ikw] 
= 0; } else { - dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -6109,7 +6109,7 @@ void ggml_compute_forward_conv_transpose_2d( const float * const src = (float *)((char *) src1->data + i12*nb12 + i11*nb11); ggml_fp16_t * dst_data = wdata + i11*ne10*ne12; for (int i10 = 0; i10 < ne10; i10++) { - dst_data[i10*ne12 + i12] = GGML_FP32_TO_FP16(src[i10]); + dst_data[i10*ne12 + i12] = GGML_CPU_FP32_TO_FP16(src[i10]); } } } @@ -6358,7 +6358,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( case GGML_OP_POOL_COUNT: GGML_ABORT("fatal error"); } for (int ki = 0; ki < k; ++ki) { - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { case GGML_OP_POOL_AVG: drow[i] += srow_j; break; case GGML_OP_POOL_MAX: if (srow_j > drow[i]) drow[i] = srow_j; break; @@ -6450,7 +6450,7 @@ void ggml_compute_forward_pool_2d( for (int kx = 0; kx < k0; ++kx) { int j = ix + kx; if (j < 0 || j >= src->ne[0]) continue; - const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); + const float srow_j = (src->type == GGML_TYPE_F32) ? ((const float*)srow)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t*)srow)[j]); switch (op) { case GGML_OP_POOL_AVG: *out += srow_j; break; case GGML_OP_POOL_MAX: if (srow_j > *out) *out = srow_j; break; @@ -6538,7 +6538,7 @@ void ggml_compute_forward_pool_2d_back( } const float val = dst->type == GGML_TYPE_F32 ? - ((const float *) drowf)[j] : GGML_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); + ((const float *) drowf)[j] : GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drowf)[j]); if (val <= maxval) { continue; } @@ -6558,7 +6558,7 @@ void ggml_compute_forward_pool_2d_back( if (dst->type == GGML_TYPE_F32) { ((float *) drow)[j] += grad0; } else { - ((ggml_fp16_t *) drow)[j] = GGML_FP32_TO_FP16(grad0 + GGML_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); + ((ggml_fp16_t *) drow)[j] = GGML_CPU_FP32_TO_FP16(grad0 + GGML_CPU_FP16_TO_FP32(((const ggml_fp16_t *) drow)[j])); } } else if (op == GGML_OP_POOL_AVG) { const float grad = grad0 / ka; @@ -6577,7 +6577,7 @@ void ggml_compute_forward_pool_2d_back( if (dst->type == GGML_TYPE_F32) { ((float *) drow)[j] += grad; } else { - ((ggml_fp16_t *) drow)[j] += GGML_FP32_TO_FP16(grad); + ((ggml_fp16_t *) drow)[j] += GGML_CPU_FP32_TO_FP16(grad); } } } @@ -7142,7 +7142,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( // loop over n_kv and n_head_kv // ref: https://arxiv.org/pdf/2112.05682.pdf for (int64_t ic = 0; ic < nek1; ++ic) { - const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f; + const float mv = mp ? 
slope*GGML_CPU_FP16_TO_FP32(mp[ic]) : 0.0f; if (mv == -INFINITY) { continue; } @@ -7210,7 +7210,7 @@ static void ggml_compute_forward_flash_attn_ext_f16( if (v->type == GGML_TYPE_F16) { for (int64_t d = 0; d < DV; ++d) { - VKQ32[d] = GGML_FP16_TO_FP32(VKQ16[d]); + VKQ32[d] = GGML_CPU_FP16_TO_FP32(VKQ16[d]); } } diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index d2e705f28..ee35ab42f 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -2,6 +2,7 @@ #include "ggml-common.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "ggml-quants.h" #include "quants.h" @@ -137,7 +138,7 @@ void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); + sumf += sumi*GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d); } *s = sumf; @@ -174,7 +175,7 @@ void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -217,7 +218,7 @@ void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)) * sumi; } *s = sumf; @@ -260,7 +261,7 @@ void ggml_vec_dot_q5_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, c } int sumi = sumi0 + sumi1; - sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); + sumf += (GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d))*sumi + GGML_CPU_FP16_TO_FP32(x[ib].m)*GGML_CPU_FP16_TO_FP32(y[ib].s); } *s = sumf; @@ -290,7 +291,7 @@ void ggml_vec_dot_q8_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c sumi += x[ib].qs[j]*y[ib].qs[j]; } - sumf += sumi*(GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)); + sumf += sumi*(GGML_CPU_FP16_TO_FP32(x[ib].d)*GGML_CPU_FP16_TO_FP32(y[ib].d)); } *s = sumf; @@ -342,7 +343,7 @@ void ggml_vec_dot_tq1_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } - sumf += (float) sum * (GGML_FP16_TO_FP32(x[i].d) * y[i].d); + sumf += (float) sum * (GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d); } *s = sumf; @@ -372,7 +373,7 @@ void ggml_vec_dot_tq2_0_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, } } - const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); + const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); sumf += (float) sumi * d; } @@ -405,8 +406,8 @@ void ggml_vec_dot_q2_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c summs += y[i].bsums[j] * (sc[j] >> 4); } - const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); + const float dall = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d); + const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); int isum = 0; int is = 0; @@ -504,7 +505,7 @@ void ggml_vec_dot_q3_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 
0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -577,9 +578,9 @@ void ggml_vec_dot_q4_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -657,9 +658,9 @@ void ggml_vec_dot_q5_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; + const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d; sumf -= dmin * sumi; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -714,7 +715,7 @@ void ggml_vec_dot_q6_K_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, c for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; q8 += 8; a += 8; } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; } for (int l = 0; l < 8; ++l) sumf += sums[l]; @@ -739,7 +740,7 @@ void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t bsum = 0; @@ -778,7 +779,7 @@ void ggml_vec_dot_iq2_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint16_t * GGML_RESTRICT q2 = x[i].qs; const uint8_t * GGML_RESTRICT sc = x[i].scales; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -829,7 +830,7 @@ void ggml_vec_dot_iq2_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0; for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const int8_t * q8 = y[i].qs; const uint8_t * qs = x[i].qs; const uint8_t * qh = x[i].qh; @@ -882,7 +883,7 @@ void ggml_vec_dot_iq3_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT q3 = x[i].qs; const uint8_t * GGML_RESTRICT gas = x[i].qs + QK_K/4; const int8_t * GGML_RESTRICT q8 = y[i].qs; @@ -924,7 +925,7 @@ void ggml_vec_dot_iq3_s_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0.f; for (int i = 0; i < nb; ++i) { - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d; const uint8_t * GGML_RESTRICT qs = x[i].qs; const uint8_t * GGML_RESTRICT qh = x[i].qh; const uint8_t * GGML_RESTRICT signs = x[i].signs; @@ -1002,7 +1003,7 @@ void ggml_vec_dot_iq1_s_q8_K_generic(int n, float * GGML_RESTRICT s, 
size_t bs, qs += 4; } - sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); + sumf += GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d * (sumi + IQ1S_DELTA * sumi1); } *s = sumf; @@ -1063,7 +1064,7 @@ void ggml_vec_dot_iq1_m_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, qh += 2; } - sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); + sumf += GGML_CPU_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); } *s = sumf; @@ -1087,7 +1088,7 @@ void ggml_vec_dot_iq4_nl_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0; for (; ib < nb; ++ib) { - const float d = GGML_FP16_TO_FP32(y[ib].d)*GGML_FP16_TO_FP32(x[ib].d); + const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_CPU_FP16_TO_FP32(x[ib].d); int sumi1 = 0, sumi2 = 0; for (int j = 0; j < QK4_NL/2; ++j) { sumi1 += y[ib].qs[j+ 0] * kvalues_iq4nl[x[ib].qs[j] & 0xf]; @@ -1113,7 +1114,7 @@ void ggml_vec_dot_iq4_xs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, float sumf = 0; for (int ibl = 0; ibl < nb; ++ibl) { - const float d4d8 = GGML_FP16_TO_FP32(x[ibl].d) * y[ibl].d; + const float d4d8 = GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d; uint16_t h = x[ibl].scales_h; const uint8_t * qs = x[ibl].qs; const int8_t * q8 = y[ibl].qs; diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 692c53e01..72ee93a5a 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include "arch-fallback.h" @@ -72,7 +73,7 @@ void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GG const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -110,7 +111,7 @@ void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GG const float d = amax / ((1 << 7) - 1); id[row_iter] = d ? 
1.0f / d : 0.0f; - y[i].d[row_iter] = GGML_FP32_TO_FP16(d); + y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); } for (int j = 0; j < QK8_0 * 4; j++) { @@ -236,7 +237,7 @@ void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -280,7 +281,7 @@ void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -325,7 +326,7 @@ void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -396,13 +397,13 @@ void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; } } for (int sb = 0; sb < 8; sb++) { uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; for (int j = 0; j < ncols_interleaved; j++) { - sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; + sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; } } } @@ -449,7 +450,7 @@ void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); } - sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d); + sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); } } } @@ -500,7 +501,7 @@ void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -555,7 +556,7 @@ void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * 
GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -609,7 +610,7 @@ void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } @@ -688,7 +689,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, sumi2 = sumi2 * scales_1[j]; sumi += sumi1 + sumi2; } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; } } } @@ -697,7 +698,7 @@ void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, for(int m = 0; m < 4; m++) { const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); for(int j = 0; j < ncols_interleaved; j++) { - sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; + sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; } } } @@ -753,7 +754,7 @@ void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); } - sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]); + sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); } } } diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index e42364c59..b68ac0dd6 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -2,10 +2,167 @@ #include "ggml-cpu-impl.h" +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#endif // __ARM_FEATURE_SVE + +#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) +// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include <arm_neon.h> +#endif + +#if defined(__F16C__) +#include <immintrin.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + // // simd mappings // +// FP16 to FP32 conversion + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +// +// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 +// for MUSA compilers, we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 +// +#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + + static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; + } + + static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; + } +#elif defined(__F16C__) + #ifdef _MSC_VER + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) 
_mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) + #else + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) + #endif +#elif defined(__POWER9_VECTOR__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) + /* the inline asm below is about 12% faster than the lookup method */ + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; + } + + static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { + double d; + ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; + } +#elif defined(__riscv) && defined(__riscv_zfhmin) + static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + __asm__( + "fmv.h.x %[f], %[h]\n\t" + "fcvt.s.h %[f], %[f]" + : [f] "=&f" (f) + : [h] "r" (h) + ); + return f; + } + + static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __asm__( + "fcvt.h.s %[f], %[f]\n\t" + "fmv.x.h %[h], %[f]" + : [h] "=&r" (res) + : [f] "f" (f) + ); + return res; + } + + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) +#elif defined(__NNPA__) + #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x) + #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x) + + #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x) + #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x) + + static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) { + uint16x8_t v_h = vec_splats(h); + uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0); + return vec_extend_to_fp32_hi(v_hd, 0)[0]; + } + + static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) { + float32x4_t v_f = vec_splats(f); + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0); + uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0); + return vec_extract(v_h, 0); + } +#endif + +// precomputed f32 table for f16 (256 KB) +// defined in ggml-cpu.c, initialized in ggml_cpu_init() +extern float ggml_table_f32_f16[1 << 16]; + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. 
+#if !defined(GGML_CPU_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_CPU_FP32_TO_FP16) +#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -415,7 +572,7 @@ static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) { float tmp[8]; for (int i = 0; i < 8; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } return _mm256_loadu_ps(tmp); @@ -426,7 +583,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { _mm256_storeu_ps(arr, y); for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) @@ -574,10 +731,10 @@ static inline unsigned char ggml_endian_byte(int i) { inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]); return wasm_v128_load(tmp); } @@ -587,10 +744,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { wasm_v128_store(tmp, x); - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); + p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]); + p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]); + p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]); + p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]); } #define GGML_F16x4 v128_t @@ -690,10 +847,10 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return _mm_loadu_ps(tmp); } @@ -703,10 +860,10 @@ static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { _mm_storeu_ps(arr, y); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -828,7 +985,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F32x4_ZERO __lsx_vldi(0) #define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0) #define GGML_F32x4_LOAD(x) __lsx_vld((x), 0) -#define GGML_F32x4_STORE((x),(y)) __lsx_vst((y), (x), 0) +#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0) #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) #define GGML_F32x4_ADD __lsx_vfadd_s #define GGML_F32x4_MUL 
__lsx_vfmul_s @@ -874,10 +1031,10 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); + tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]); + tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]); + tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]); + tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]); return __lsx_vld(tmp, 0); } @@ -887,10 +1044,10 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { __lsx_vst(y, arr, 0); - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); + x[0] = GGML_CPU_FP32_TO_FP16(arr[0]); + x[1] = GGML_CPU_FP32_TO_FP16(arr[1]); + x[2] = GGML_CPU_FP32_TO_FP16(arr[2]); + x[3] = GGML_CPU_FP32_TO_FP16(arr[3]); } #define GGML_F32Cx4 __m128 @@ -922,7 +1079,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F32_STEP 32 #define GGML_F32_EPR 4 -#define GGML_F32x4 __vector float +#define GGML_F32x4 float32x4_t #define GGML_F32x4_ZERO vec_splats(0.0f) #define GGML_F32x4_SET1 vec_splats #define GGML_F32x4_LOAD(p) vec_xl(0, p) @@ -962,28 +1119,45 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_STEP GGML_F32_STEP #define GGML_F16_EPR GGML_F32_EPR -static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) { +static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) { +#if defined(__NNPA__) + uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x); + uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0); + return vec_extend_to_fp32_hi(v_xd, 0); +#else float tmp[4]; for (int i = 0; i < 4; i++) { - tmp[i] = GGML_FP16_TO_FP32(x[i]); + tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]); } // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 return vec_xl(0, (const float *)(tmp)); +#endif } -static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { +static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) { +#if defined(__NNPA__) + float32x4_t v_zero = vec_splats(0.0f); + uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0); + uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0); + + x[0] = vec_extract(v_x, 0); + x[1] = vec_extract(v_x, 1); + x[2] = vec_extract(v_x, 2); + x[3] = vec_extract(v_x, 3); +#else float arr[4]; // note: keep type-cast here to prevent compiler bugs // see: https://github.com/ggml-org/llama.cpp/issues/12846 - vec_xst(y, 0, (float *)(arr)); + vec_xst(v_y, 0, (float *)(arr)); for (int i = 0; i < 4; i++) { - x[i] = GGML_FP32_TO_FP16(arr[i]); + x[i] = GGML_CPU_FP32_TO_FP16(arr[i]); } +#endif } #define GGML_F16_VEC GGML_F32x4 @@ -1004,3 +1178,7 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) { #define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index f7614568e..5e34d79a1 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -219,11 +219,11 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G // leftovers for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #else for (int 
i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); + sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); } #endif diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 09dbade21..84f6c0e6d 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -58,7 +58,7 @@ inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } @@ -67,7 +67,7 @@ inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } @@ -75,20 +75,20 @@ inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i])); + y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i])); } } inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i])); } } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) { for (int i = 0; i < n; ++i) { - z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i])); + z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i])); } } @@ -131,13 +131,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG // leftovers for (int i = np; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { for (int j = 0; j < 
GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } } #endif @@ -280,12 +280,12 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); } #endif } @@ -430,12 +430,12 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float // leftovers for (int i = np; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #else // scalar for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); } #endif } @@ -444,103 +444,103 @@ inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*v); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*v); } } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); } inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); } inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i]))); + y[i] 
= GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f)); } } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); + y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f); } } inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); } inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); } inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expm1f(GGML_CPU_FP16_TO_FP32(x[i]))); } } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f); } } inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); } inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f)); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? 
v : 0.f)); } } inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); } inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i])))); + y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i])))); } } // TODO: optimize performance inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); + float v = GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f))); } } inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); } inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); + y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f))); } } inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i]))); + y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i]))); } } @@ -562,9 +562,9 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float xi = GGML_FP16_TO_FP32(x[i]); + float xi = GGML_CPU_FP16_TO_FP32(x[i]); float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); - y[i] = GGML_FP32_TO_FP16(res); + y[i] = GGML_CPU_FP32_TO_FP16(res); } } @@ -577,9 +577,9 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } else if (x[i] >= 10.0f) { y[i] = x[i]; } else { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]); } } } @@ -613,9 +613,9 @@ inline static float ggml_gelu_quick_f32(float x) { inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { uint16_t t; for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]); memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); + y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]); } } #else @@ -628,8 +628,8 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) { - float v = GGML_FP16_TO_FP32(x[i]); - y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); + float v = 
GGML_CPU_FP16_TO_FP32(x[i]); + y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v)))); } } @@ -638,8 +638,8 @@ inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); } inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) { - float v = GGML_FP16_TO_FP32(x); - return GGML_FP32_TO_FP16(v/(1.0f + expf(-v))); + float v = GGML_CPU_FP16_TO_FP32(x); + return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v))); } #if __FINITE_MATH_ONLY__ @@ -888,9 +888,9 @@ inline static float ggml_silu_backward_f32(float x, float dy) { } inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) { - const float v = GGML_FP16_TO_FP32(x); + const float v = GGML_CPU_FP16_TO_FP32(x); const float s = 1.0f/(1.0f + expf(-v)); - return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); + return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s))); } inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) { @@ -928,7 +928,7 @@ inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) { float sum = 0.0f; for (int i = 0; i < n; ++i) { - sum += GGML_FP16_TO_FP32(x[i]); + sum += GGML_CPU_FP16_TO_FP32(x[i]); } *s = sum; } diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 6dc5ce0d9..57761644f 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -317,203 +317,81 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); GGML_API void * ggml_aligned_malloc(size_t size); GGML_API void ggml_aligned_free(void * ptr, size_t size); -// FP16 to FP32 conversion +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -// -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 -// -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - - #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __fp16 tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; - } - -#elif defined(__F16C__) - - #ifdef _MSC_VER - #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) - #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) - #else - #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) - #endif - -#elif defined(__POWER9_VECTOR__) - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - /* the inline asm below is about 12% faster than the lookup method */ - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp 
%1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - double d; - ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; - } - -#elif defined(__riscv) && defined(__riscv_zfhmin) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - __asm__( - "fmv.h.x %[f], %[h]\n\t" - "fcvt.s.h %[f], %[f]" - : [f] "=&f" (f) - : [h] "r" (h) - ); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __asm__( - "fcvt.h.s %[f], %[f]\n\t" - "fmv.x.h %[h], %[f]" - : [h] "=&r" (res) - : [f] "f" (f) - ); - return res; - } - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -#else - - // FP16 <-> FP32 - // ref: https://github.com/Maratyszcza/FP16 - - static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; - } - - static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; - } - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float exp_scale = 0x1.0p-112f; - #else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); - #endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; - #else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); - #endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); - } - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - -// precomputed f32 table for f16 (256 KB) -// defined in ggml.c, initialized in ggml_init() -GGML_API float ggml_table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; } -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} -#if !defined(GGML_FP32_TO_FP16) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); #endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) /** * Converts brain16 to float32. diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f8e7c595b..ee605977f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -61,9 +61,6 @@ #define m512i(p) (__m512i)(p) #endif -// precomputed f32 table for f16 (256 KB) (ggml-impl.h) -float ggml_table_f32_f16[1 << 16]; - #if defined(__linux__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ (defined(__APPLE__) && !TARGET_OS_TV && !TARGET_OS_WATCH) @@ -1422,14 +1419,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { // initialize time system (required on Windows) ggml_time_init(); - for (int i = 0; i < (1 << 16); ++i) { - union { - uint16_t u16; - ggml_fp16_t fp16; - } u = {i}; - ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); - } - is_first_call = false; } From 716301d1b03c31875ec3b24526c48c8b1bd0fd8c Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Thu, 26 Jun 2025 12:11:59 +0800 Subject: [PATCH 12/54] musa: enable fp16 mma (all) and cublas on qy2 (#13842) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * musa: enable fp16 mma (all) and cublas on qy2 Signed-off-by: Xiaodong Ye * Update ggml/src/ggml-cuda/ggml-cuda.cu Co-authored-by: Johannes Gäßler * Address review comments Signed-off-by: Xiaodong Ye * Address review comments Signed-off-by: Xiaodong Ye * musa: disable MUL_MAT_ID (q2_k × f32) due to precision issues Signed-off-by: Xiaodong Ye --------- Signed-off-by: Xiaodong Ye Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 25 +++++++++++++------------ ggml/src/ggml-cuda/fattn-wmma-f16.cu | 4 ++++ ggml/src/ggml-cuda/ggml-cuda.cu | 25 +++++++++++++++---------- ggml/src/ggml-musa/mudnn.cuh | 4 ++-- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index f6127aeee..ea2035502 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -76,11 +76,9 @@ #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= 
GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1) // Moore Threads -#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) - -#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) @@ -203,9 +201,9 @@ typedef float2 dfloat2; #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA +#endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA) || defined(GGML_USE_MUSA) #if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4))) #define FP16_MMA_AVAILABLE @@ -219,9 +217,9 @@ typedef float2 dfloat2; #define CP_ASYNC_AVAILABLE #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE -#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) #define FLASH_ATTN_AVAILABLE -#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1) +#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ < 220) static bool fp16_available(const int cc) { return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL; @@ -233,7 +231,8 @@ static bool fast_fp16_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. static bool fast_fp16_hardware_available(const int cc) { - return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc); + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } // Any FP16 tensor core instructions are available for ggml code. @@ -242,7 +241,8 @@ static bool fp16_mma_available(const int cc) { return false; #else if ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc)) { + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || + GGML_CUDA_CC_IS_MTHREADS(cc)) { return true; } else if (GGML_CUDA_CC_IS_RDNA4(cc)) { #if defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_HIP_ROCWMMA_FATTN_GFX12) @@ -259,7 +259,8 @@ static bool fp16_mma_available(const int cc) { // To be used for feature selection of external libraries, e.g. cuBLAS. 
static bool fp16_mma_hardware_available(const int cc) { return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || - GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc); + GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); } static bool bf16_mma_hardware_available(const int cc) { diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index c5668adb1..f3b794c36 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -9,7 +9,11 @@ #ifdef FP16_MMA_AVAILABLE #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) #include +#ifdef GGML_USE_MUSA +namespace wmma = mtmusa::wmma; +#else // GGML_USE_MUSA namespace wmma = nvcuda::wmma; +#endif // GGML_USE_MUSA #elif defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE) #undef HIP_ENABLE_WARP_SYNC_BUILTINS // conflicts with rocWMMA headers #include diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b3e6833c3..b30c13c62 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1227,9 +1227,12 @@ static void ggml_cuda_op_mul_mat_cublas( const int cc = ggml_cuda_info().devices[id].cc; + const bool supports_bf16 = GGML_CUDA_CC_IS_NVIDIA(cc) || GGML_CUDA_CC_IS_AMD(cc) || + (GGML_CUDA_CC_IS_MTHREADS(cc) && cc >= GGML_CUDA_CC_QY2); + const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT; - if (src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { ggml_cuda_pool_alloc src1_as_bf16(ctx.pool(id)); if (src1->type != GGML_TYPE_BF16) { const to_bf16_cuda_t to_bf16_cuda = ggml_get_to_bf16_cuda(src1->type); @@ -1257,7 +1260,7 @@ static void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); - } else if (((GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) { + } else if (fast_fp16_hardware_available(cc) && use_fp16) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { @@ -3061,9 +3064,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } #ifdef GGML_USE_MUSA - if (b->type == GGML_TYPE_F16 && b->ne[2]*b->ne[3] > 1 && - !ggml_is_transposed(a) && !ggml_is_transposed(b)) { - return false; + const int cc = ggml_cuda_info().devices[dev_ctx->device].cc; + if (b->ne[2]*b->ne[3] > 1 && !ggml_is_transposed(a) && !ggml_is_transposed(b)) { + if (GGML_CUDA_CC_IS_QY1(cc) && op->op == GGML_OP_MUL_MAT && + a->type == GGML_TYPE_F16 && b->type == GGML_TYPE_F16) { + return false; + } + if (GGML_CUDA_CC_IS_QY2(cc) && op->op == GGML_OP_MUL_MAT_ID && + a->type == GGML_TYPE_Q2_K && b->type == GGML_TYPE_F32) { + return false; + } } #endif // GGML_USE_MUSA switch (a->type) { @@ -3090,11 +3100,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_BF16: -#ifdef GGML_USE_MUSA - if (a->type == GGML_TYPE_Q3_K) { - return false; - } -#endif // 
GGML_USE_MUSA
         return true;
     default:
         return false;
diff --git a/ggml/src/ggml-musa/mudnn.cuh b/ggml/src/ggml-musa/mudnn.cuh
index a63be5755..c30128561 100644
--- a/ggml/src/ggml-musa/mudnn.cuh
+++ b/ggml/src/ggml-musa/mudnn.cuh
@@ -1,7 +1,7 @@
 #pragma once

-#include "../include/ggml.h"
-#include "../ggml-cuda/common.cuh"
+#include "ggml-cuda/common.cuh"
+#include "ggml.h"

 // Asynchronously copies data from src tensor to dst tensor using the provided context.
 // Returns a musaError_t indicating success or failure.

From bf5bcd0b857db420235e03639f0a5f218a7f8cf8 Mon Sep 17 00:00:00 2001
From: Aaron Teo
Date: Thu, 26 Jun 2025 18:41:41 +0800
Subject: [PATCH 13/54] docs: update s390x documentation + add faq (#14389)

* docs: update s390x documentation + add faq

Signed-off-by: Aaron Teo

* docs: add s390x z17 build q&a

Signed-off-by: Aaron Teo

---------

Signed-off-by: Aaron Teo
---
 docs/build-s390x.md | 76 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 2 deletions(-)

diff --git a/docs/build-s390x.md b/docs/build-s390x.md
index bb6eae784..4c9ebb271 100644
--- a/docs/build-s390x.md
+++ b/docs/build-s390x.md
@@ -16,7 +16,7 @@ cd llama.cpp

 ## CPU Build with BLAS

-Building llama.cpp with BLAS support is highly recommended as it has shown to provide performance improvements.
+Building llama.cpp with BLAS support is highly recommended as it has been shown to provide performance improvements. Make sure to have OpenBLAS installed in your environment.

 ```bash
 cmake -S . -B build \
@@ -82,12 +82,18 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

 1. **Use pre-converted models verified for use on IBM Z & LinuxONE (easiest)**

+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
    You can find popular models pre-converted and verified at [s390x Ready Models](https://huggingface.co/collections/taronaeo/s390x-ready-models-672765393af438d0ccb72a08).

-   These models and their respective tokenizers are verified to run correctly on IBM Z & LinuxONE.
+   These models have already been converted from `safetensors` to `GGUF Big-Endian`, and their respective tokenizers have been verified to run correctly on IBM z15 and later systems.

 2. **Convert safetensors model to GGUF Big-Endian directly (recommended)**

+   ![File Type - safetensors](https://img.shields.io/badge/File_Type-safetensors-da1e28)
+
+   The model you are trying to convert must be in `safetensors` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct)). Make sure you have downloaded the model repository for this case.
+
    ```bash
    python3 convert_hf_to_gguf.py \
        --outfile model-name-be.f16.gguf \
@@ -108,6 +114,10 @@ All models need to be converted to Big-Endian. You can achieve this in three cas

 3. **Convert existing GGUF Little-Endian model to Big-Endian**

+   ![File Type - gguf](https://img.shields.io/badge/File_Type-gguf-fff)
+
+   The model you are trying to convert must be in `gguf` file format (for example [IBM Granite 3.3 2B](https://huggingface.co/ibm-granite/granite-3.3-2b-instruct-GGUF)). Make sure you have downloaded the model file for this case.
+
    ```bash
    python3 gguf-py/gguf/scripts/gguf_convert_endian.py model-name.f16.gguf BIG
    ```
@@ -163,6 +173,22 @@ It is strongly recommended to disable SMT via the kernel boot parameters as it n

 IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongly recommended to use BLAS.

+## Frequently Asked Questions (FAQ)
+
+1. I'm getting the following error message while trying to load a model: `gguf_init_from_file_impl: failed to load model: this GGUF file version 50331648 is extremely large, is there a mismatch between the host and model endianness?`
+
+   Answer: Please ensure that the model you have downloaded/converted is GGUFv3 Big-Endian. These models are usually denoted with the `-be` suffix, i.e., `granite-3.3-2b-instruct-be.F16.gguf`.
+
+   You may refer to the [Getting GGUF Models](#getting-gguf-models) section to manually convert a `safetensors` model to `GGUF` Big Endian.
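+
+   As a quick sanity check, you can read the version field at the start of the file yourself. GGUF stores the magic `GGUF` followed by a 32-bit version; decoded as little-endian, a little-endian file reports version `3` while a big-endian file reports `50331648` (`0x03000000`, i.e. `3` with the byte order swapped). A minimal sketch, assuming Python 3 and a local file named `model.gguf`:
+
+   ```python
+   import struct
+
+   with open("model.gguf", "rb") as f:
+       magic = f.read(4)                            # expected: b'GGUF'
+       (version,) = struct.unpack("<I", f.read(4))  # decode as little-endian
+
+   # 50331648 == 0x03000000, i.e. GGUF version 3 with swapped byte order
+   print("big-endian file" if version == 50331648 else f"little-endian file, version {version}")
+   ```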
+
+2. I'm getting extremely poor performance when running inference on a model.
+
+   Answer: Please refer to the [Appendix B: SIMD Support Matrix](#appendix-b-simd-support-matrix) to check if your model quantization is supported by SIMD acceleration.
+
+3. I'm building on IBM z17 and getting the following error message: `invalid switch -march=z17`
+
+   Answer: Please ensure that your GCC compiler is at least version 15.1.0 and that `binutils` is updated to the latest version. If this does not fix the problem, kindly open an issue.
+
 ## Getting Help on IBM Z & LinuxONE

 1. **Bugs, Feature Requests**
@@ -172,3 +198,49 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl

 2. **Other Questions**

    Please reach out directly to [aionz@us.ibm.com](mailto:aionz@us.ibm.com).
+
+## Appendix A: Hardware Support Matrix
+
+|         | Support | Minimum Compiler Version |
+| ------- | ------- | ------------------------ |
+| IBM z15 | ✅      |                          |
+| IBM z16 | ✅      |                          |
+| IBM z17 | ✅      | GCC 15.1.0               |
+
+- ✅ - supported and verified to run as intended
+- 🚫 - unsupported, and we are unlikely to be able to provide support
+
+## Appendix B: SIMD Support Matrix
+
+|            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
+| ---------- | ----------- | ---- | ---- | ----- |
+| FP32       | ✅          | ✅   | ❓   | ❓    |
+| FP16       | ✅          | ✅   | ❓   | ❓    |
+| BF16       | 🚫          | 🚫   | ❓   | ❓    |
+| Q4_0       | ✅          | ✅   | ❓   | ❓    |
+| Q4_1       | ✅          | ✅   | ❓   | ❓    |
+| Q5_0       | 🚫          | 🚫   | ❓   | ❓    |
+| Q5_1       | 🚫          | 🚫   | ❓   | ❓    |
+| Q8_0       | ✅          | ✅   | ❓   | ❓    |
+| Q2_K       | 🚫          | 🚫   | ❓   | ❓    |
+| Q3_K       | ✅          | ✅   | ❓   | ❓    |
+| Q4_K       | ✅          | ✅   | ❓   | ❓    |
+| Q5_K       | ✅          | ✅   | ❓   | ❓    |
+| Q6_K       | ✅          | ✅   | ❓   | ❓    |
+| TQ1_0      | 🚫          | 🚫   | ❓   | ❓    |
+| TQ2_0      | 🚫          | 🚫   | ❓   | ❓    |
+| IQ2_XXS    | 🚫          | 🚫   | ❓   | ❓    |
+| IQ2_XS     | 🚫          | 🚫   | ❓   | ❓    |
+| IQ2_S      | 🚫          | 🚫   | ❓   | ❓    |
+| IQ3_XXS    | 🚫          | 🚫   | ❓   | ❓    |
+| IQ3_S      | 🚫          | 🚫   | ❓   | ❓    |
+| IQ1_S      | 🚫          | 🚫   | ❓   | ❓    |
+| IQ1_M      | 🚫          | 🚫   | ❓   | ❓    |
+| IQ4_NL     | ✅          | ✅   | ❓   | ❓    |
+| IQ4_XS     | ✅          | ✅   | ❓   | ❓    |
+| FP32->FP16 | 🚫          | ✅   | ❓   | ❓    |
+| FP16->FP32 | 🚫          | ✅   | ❓   | ❓    |
+
+- ✅ - acceleration available
+- 🚫 - acceleration unavailable, will still run using scalar implementation
+- ❓ - acceleration unknown, please contribute if you can test it yourself

From 5783ae43599400b723b5da0569c1f848419ff3c7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 26 Jun 2025 15:50:15 +0300
Subject: [PATCH 14/54] metal : batch rows copy in a single threadgroup
 (#14384)

* metal : batch rows copy in a single threadgroup

ggml-ci

* metal : handle some edge cases when threadgroup size is not a power of 2

ggml-ci
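The sizing logic added here follows one pattern at every call site: grow the threadgroup width in powers of two, clamp it to the pipeline limit (which is not guaranteed to be a power of two), and, for the copy kernel, additionally pack several short rows into a single threadgroup. A standalone sketch of the same arithmetic, with assumed values standing in for the pipeline limit and the row length:

```python
# sketch of the threadgroup sizing; both inputs below are assumed values
max_tptg = 1000  # stand-in for pipeline.maxTotalThreadsPerThreadgroup
nk00     = 24    # stand-in for the row length

nth = 32         # start from the SIMD width
while nth < nk00 and nth < max_tptg:
    nth *= 2
nth = min(nth, max_tptg)   # required when the limit is not a power of two

nrptg = 1                  # rows per threadgroup
if nth > nk00:
    nrptg = (nth + nk00 - 1) // nk00  # ceil(nth / nk00)
    nth = nk00
    if nrptg * nth > max_tptg:
        nrptg -= 1                    # stay within the threadgroup limit
nth = min(nth, nk00)

print(nth, nrptg)  # -> 24 2, so the grid shrinks to (ne01 + 1) // 2 groups in x
```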
---
 ggml/src/ggml-metal/ggml-metal.m     | 43 ++++++++++++++++++++++++----
 ggml/src/ggml-metal/ggml-metal.metal | 11 +++++--
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index 19f4d59e5..248fa378e 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -2450,6 +2450,7 @@ static bool ggml_metal_encode_node(
                 nth *= 2;
             }

+            nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
             nth = MIN(nth, ne00);

             ggml_metal_kargs_sum_rows args = {
@@ -3780,6 +3781,7 @@ static bool ggml_metal_encode_node(
                     nth *= 2;
                 }

+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
                 nth = MIN(nth, ne00/4);

                 ggml_metal_kargs_rms_norm args = {
@@ -3816,6 +3818,7 @@ static bool ggml_metal_encode_node(
                     nth *= 2;
                 }

+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
                 nth = MIN(nth, ne00/4);

                 ggml_metal_kargs_l2_norm args = {
@@ -3888,6 +3891,7 @@ static bool ggml_metal_encode_node(
                 nth *= 2;
             }

+            nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
            nth = MIN(nth, ne00/4);

             ggml_metal_kargs_norm args = {
@@ -4974,8 +4978,39 @@ static bool ggml_metal_encode_node(
                 default: GGML_ABORT("not implemented");
             }

+            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+            // TODO: support
+            //const int32_t nk00 = ne00/ggml_blck_size(dst->type);
+            const int32_t nk00 = ne00;
+
+            int nth = 32; // SIMD width
+
+            while (nth < nk00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                nth *= 2;
+            }
+
+            nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
+
+            // when rows are small, we can batch them together in a single threadgroup
+            int nrptg = 1;
+
+            // TODO: relax this constraint in the future
+            if (ggml_blck_size(src0->type) == 1 && ggml_blck_size(dst->type) == 1) {
+                if (nth > nk00) {
+                    nrptg = (nth + nk00 - 1)/nk00;
+                    nth   = nk00;
+
+                    if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                        nrptg--;
+                    }
+                }
+            }
+
+            nth = MIN(nth, nk00);
+
             ggml_metal_kargs_cpy args = {
-                /*.ne00 =*/ ne00,
+                /*.ne00 =*/ nk00,
                 /*.ne01 =*/ ne01,
                 /*.ne02 =*/ ne02,
                 /*.ne03 =*/ ne03,
@@ -4998,11 +5033,7 @@ static bool ggml_metal_encode_node(
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
             [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];

-            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
-            int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
-
-            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-
+            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)];
         } break;
     case GGML_OP_SET:
         {
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 3da19879b..f02827606 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4306,11 +4306,16 @@ kernel void kernel_cpy(
         device const char * src0,
         device       char * dst,
         uint3   tgpig[[threadgroup_position_in_grid]],
+        uint    tiitg[[thread_index_in_threadgroup]],
         ushort3 tpitg[[thread_position_in_threadgroup]],
-        ushort3   ntg[[threads_per_threadgroup]]) {
+        ushort3  tptg[[threads_per_threadgroup]]) {
     const int i03 = tgpig[2];
     const int i02 = tgpig[1];
-    const int i01 = tgpig[0];
+    const int i01 = tgpig[0]*tptg.y + tiitg/tptg.x;
+
+    if (i01 >= args.ne01) {
+        return;
+    }

     const int64_t n = i03*args.ne02*args.ne01*args.ne00 + i02*args.ne01*args.ne00 + i01*args.ne00;

@@ -4321,7 +4326,7 @@ kernel void kernel_cpy(

     device T1 * dst_data = (device T1 *) (dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);

-    for (int64_t i00 = tpitg.x; i00 < args.ne00; i00 += ntg.x) {
+    for (int64_t i00 = tiitg%tptg.x; i00 < args.ne00; i00 += tptg.x) {
         device const T0 * src = (device T0 *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00);
         dst_data[i00] = (T1) src[0];
     }

From e8215dbb96b8fb94a24c29cdd228166fb972dbfc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 26 Jun 2025 15:51:19 
+0300 Subject: [PATCH 15/54] metal : add special-case mat-vec mul for ne00 == 4 (#14385) ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 25 +++++++++-- ggml/src/ggml-metal/ggml-metal.metal | 64 ++++++++++++++++++++++++++++ tests/test-backend-ops.cpp | 64 +++++++++++++++------------- 3 files changed, 121 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 248fa378e..d8d30cc0b 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -211,11 +211,14 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, @@ -1175,11 +1178,14 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, mul_mv_f32_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, mul_mv_bf16_f32_c4, use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && use_bfloat); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, mul_mv_f16_f32_c4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction); @@ -3111,14 +3117,23 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; nr1 = 4; - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + if (ne00 == 4) { + nr0 = 32; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline; + } } break; case GGML_TYPE_F16: { nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = 
ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline; @@ -3137,7 +3152,11 @@ static bool ggml_metal_encode_node( nsg = 1; nr0 = 1; if (src1t == GGML_TYPE_F32) { - if (ne11 * ne12 < 4) { + if (ne00 == 4) { + nr0 = 32; + nr1 = 4; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4].pipeline; + } else if (ne11 * ne12 < 4) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline; diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f02827606..5f004a856 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -2532,6 +2532,70 @@ template [[host_name("kernel_mul_mv_bf16_f32")]] kernel mul_mv_t kernel_mul_mv< template [[host_name("kernel_mul_mv_bf16_bf16")]] kernel mul_mv_t kernel_mul_mv; #endif +template +void kernel_mul_mv_c4_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig, + ushort tiisg) { + const int r0 = tgpig.x*32 + tiisg; + const int rb = tgpig.y*N_MV_T_T; + const int im = tgpig.z; + + if (r0 >= args.ne01) { + return; + } + + const uint i12 = im%args.ne12; + const uint i13 = im/args.ne12; + + const uint64_t offset0 = r0*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + + device const T04 * x = (device const T04 *) (src0 + offset0); + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1; + + for (int row = 0; row < N_MV_T_T; ++row) { + int r1 = rb + row; + if (r1 >= args.ne11) { + break; + } + + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + device const T14 * y = (device const T14 *) (src1 + offset1); + + dst_f32[(uint64_t)r1*args.ne0 + r0] = dot((float4) x[0], (float4) y[0]); + } +} + +template +kernel void kernel_mul_mv_c4( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]]) { + kernel_mul_mv_c4_impl( + args, + src0, + src1, + dst, + tgpig, + tiisg); +} + +typedef decltype(kernel_mul_mv_c4) mul_mv_c4_t; + +template [[host_name("kernel_mul_mv_f32_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4; +template [[host_name("kernel_mul_mv_f16_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4; +#if defined(GGML_METAL_USE_BF16) +template [[host_name("kernel_mul_mv_bf16_f32_c4")]] kernel mul_mv_c4_t kernel_mul_mv_c4; +#endif + template kernel void kernel_mul_mv_1row( constant ggml_metal_kargs_mul_mv & args, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7be7f2205..615c2dc00 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4252,39 +4252,45 @@ static std::vector> make_test_cases_eval() { #if 1 for (ggml_type type_a : base_types) { for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { - // test cases without permutation - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {1, 1}, {1, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 1}, {2, 1})); - test_cases.emplace_back(new 
test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {1, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {3, 2}, {2, 2})); + std::vector ks = { 256 }; + if (ggml_blck_size(type_a) == 1) { + ks.push_back(4); + } + for (auto k : ks) { + // test cases without permutation + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {1, 1}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {1, 1}, {1, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 1}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {1, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {1, 1}, {1, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 1}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {1, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {3, 2}, {2, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {1, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 1}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {1, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {3, 2}, {2, 2})); - // test cases with permutation - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + // test cases with permutation + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); - 
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); + } // test cases with large ne00/ne10 to cover stream-k fixup test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 1024, {3, 2}, {1, 1})); From b25346221dadb9101aa9dda55431dde4d3596943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 26 Jun 2025 15:01:14 +0200 Subject: [PATCH 16/54] llama : return mistral-v7-tekken as default template only (#14390) --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9b19da984..c2835ce67 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14377,7 +14377,7 @@ const char * llama_model_chat_template(const llama_model * model, const char * n // do not extend this list unless absolutely necessary // Mistral-Small-2503 does not have built-in chat template llama_vocab_pre_type pre_type = model->vocab.get_pre_type(); - if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { + if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) { return "mistral-v7-tekken"; } From a01047b041aa04aeea351933658433ed004516ab Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Thu, 26 Jun 2025 13:46:53 -0300 Subject: [PATCH 17/54] cmake: regen vulkan shaders when shaders-gen sources change (#14398) * Add shaders-gen sources as target deps --- ggml/src/ggml-vulkan/CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 39f022f33..0bf4cb14f 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -143,7 +143,8 @@ if (Vulkan_FOUND) -DCMAKE_BUILD_TYPE=$ ${VULKAN_SHADER_GEN_CMAKE_ARGS} - BUILD_COMMAND ${CMAKE_COMMAND} --build . --config $ + BUILD_COMMAND ${CMAKE_COMMAND} --build . 
--config $ + BUILD_ALWAYS TRUE # NOTE: When DESTDIR is set using Makefile generators and # "make install" triggers the build step, vulkan-shaders-gen @@ -164,6 +165,14 @@ if (Vulkan_FOUND) file(GLOB _ggml_vk_shader_files CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.comp") + # Because external projects do not provide source-level tracking, + # the vulkan-shaders-gen sources need to be explicitly added to + # ensure that changes will cascade into shader re-generation. + + file(GLOB _ggml_vk_shaders_gen_sources + CONFIGURE_DEPENDS "${_ggml_vk_input_dir}/*.cpp" + "${_ggml_vk_input_dir}/*.h") + add_custom_command( OUTPUT ${_ggml_vk_header} ${_ggml_vk_source} @@ -177,6 +186,7 @@ if (Vulkan_FOUND) --no-clean DEPENDS ${_ggml_vk_shader_files} + ${_ggml_vk_shaders_gen_sources} vulkan-shaders-gen COMMENT "Generate vulkan shaders" From 8846aace4934ad29651ea61b8c7e3f6b0556e3d2 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 26 Jun 2025 19:34:02 +0200 Subject: [PATCH 18/54] model : gemma3n text-only (#14400) * gemma3n * add llm_graph_input_one --- convert_hf_to_gguf.py | 124 +++++++- gguf-py/gguf/constants.py | 75 +++++ gguf-py/gguf/gguf_writer.py | 18 ++ gguf-py/gguf/tensor_mapping.py | 64 ++++ src/llama-arch.cpp | 54 ++++ src/llama-arch.h | 17 ++ src/llama-graph.cpp | 23 +- src/llama-graph.h | 16 +- src/llama-hparams.h | 6 + src/llama-kv-cache-unified.cpp | 30 +- src/llama-model.cpp | 517 +++++++++++++++++++++++++++++++++ src/llama-model.h | 22 ++ src/llama-quant.cpp | 9 +- 13 files changed, 960 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bbf8b30ff..4f2339a02 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -310,6 +310,8 @@ class ModelBase: gguf.MODEL_TENSOR.POSNET_NORM2, gguf.MODEL_TENSOR.V_ENC_EMBD_POS, gguf.MODEL_TENSOR.A_ENC_EMBD_POS, + gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, + gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, ) ) or not new_name.endswith(".weight") @@ -320,7 +322,11 @@ class ModelBase: self.match_model_tensor_name(new_name, key, bid) for key in ( gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, gguf.MODEL_TENSOR.OUTPUT, + gguf.MODEL_TENSOR.ALTUP_ROUTER, + gguf.MODEL_TENSOR.LAUREL_L, + gguf.MODEL_TENSOR.LAUREL_R, ) ): if self.ftype in ( @@ -921,13 +927,16 @@ class TextModel(ModelBase): tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) - vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + vocab_size = self.find_hparam([ + "vocab_size_per_layer_input", # gemma3n + "vocab_size", + ], optional=True) or tokenizer.vocab_size() tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - for token_id in range(tokenizer.vocab_size()): + for token_id in range(vocab_size): piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) @@ -942,6 +951,10 @@ class TextModel(ModelBase): elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE + if token_id >= vocab_size: + logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') + break + tokens[token_id] = text scores[token_id] = score toktypes[token_id] = toktype @@ -4217,6 +4230,7 @@ class Gemma2Model(TextModel): @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 + norm_shift = 1.0 # Gemma3RMSNorm 
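    # illustration: the reference Gemma3RMSNorm computes
    #   output = rms_norm(x) * (norm_shift + weight)
    # so modify_tensors() below folds norm_shift into the stored norm weight
    # (Gemma3NModel overrides norm_shift = 0.0 since its norm has no +1 shift)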
adds 1.0 to the norm value def set_vocab(self): self._set_vocab_sentencepiece() @@ -4238,9 +4252,8 @@ class Gemma3Model(TextModel): self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers - # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 + # attn_logit_softcapping is removed in Gemma3 assert hparams.get("attn_logit_softcapping") is None - assert hparams.get("final_logit_softcapping") is None self.gguf_writer.add_sliding_window(hparams["sliding_window"]) self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) if hparams.get("rope_scaling") is not None: @@ -4252,7 +4265,7 @@ class Gemma3Model(TextModel): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("language_model."): + if "language_model." in name: name = name.replace("language_model.", "") elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ @@ -4267,8 +4280,9 @@ class Gemma3Model(TextModel): # ref code in Gemma3RMSNorm # output = output * (1.0 + self.weight.float()) + # note: this is not the case on gemma3n if name.endswith("norm.weight"): - data_torch = data_torch + 1 + data_torch = data_torch + self.norm_shift return [(self.map_tensor_name(name), data_torch)] @@ -4325,6 +4339,104 @@ class Gemma3VisionModel(MmprojModel): return [] # skip other tensors +@ModelBase.register("Gemma3nForConditionalGeneration") +class Gemma3NModel(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA3N + norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code + + _altup_proj: list[Tensor] = [] + _altup_unembd: list[Tensor] = [] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs" + self._altup_proj = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + self._altup_unembd = [ + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + torch.Tensor(), # to be replaced + ] + + def set_vocab(self): + with open(self.dir_model / "chat_template.jinja") as f: + # quick hack to make sure chat template is added + self.gguf_writer.add_chat_template(f.read()) + super().set_vocab() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"]) + self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"]) + self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"]) + self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"]) + + activation_sparsity_scale = [] + for s in self.hparams["activation_sparsity_pattern"]: + normal_dist = torch.distributions.normal.Normal(0, 1) + std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32)) + activation_sparsity_scale.append(std_multiplier.item()) + self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale) + + sliding_window_pattern = [] + for t in self.hparams["layer_types"]: + sliding_window_pattern.append(t == "sliding_attention") + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + + def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None: + has_all = all(m.numel() > 0 for m in matrices) + if not 
has_all: + return None + else: + return torch.stack(matrices, dim=0) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("_scale"): + name = name + ".weight" + + # TODO: implement self.prediction_coefs.weight.clamp_(...) + + if "language_model." not in name: + return [] # skip non-language model tensors + + if "altup_unembed_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_unembd[0] = data_torch + elif ".1." in name: + self._altup_unembd[1] = data_torch + elif ".2." in name: + self._altup_unembd[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_unembd) + if out is not None: + return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)] + else: + return [] + + if "altup_projections" in name: + data_torch = data_torch.to(device="cpu") + if ".0." in name: + self._altup_proj[0] = data_torch + elif ".1." in name: + self._altup_proj[1] = data_torch + elif ".2." in name: + self._altup_proj[2] = data_torch + else: + raise ValueError(f"Unknown name: {name}") + out = self._stack_matrices(self._altup_proj) + if out is not None: + return [(self.map_tensor_name("model.altup_projections.weight"), out)] + else: + return [] + + return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Starcoder2ForCausalLM") class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0429b0aaf..fb75143b0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -118,6 +118,10 @@ class Keys: EMBEDDING_SCALE = "{arch}.embedding_scale" TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step" + ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale" + ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" + ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" + EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -142,6 +146,8 @@ class Keys: SCALE = "{arch}.attention.scale" KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" + SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" + SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -314,6 +320,7 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() GEMMA2 = auto() GEMMA3 = auto() + GEMMA3N = auto() STARCODER2 = auto() RWKV6 = auto() RWKV6QWEN2 = auto() @@ -399,6 +406,22 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() + PER_LAYER_TOKEN_EMBD = auto() # gemma3n + PER_LAYER_MODEL_PROJ = auto() # gemma3n + PER_LAYER_INP_GATE = auto() # gemma3n + PER_LAYER_PROJ = auto() # gemma3n + PER_LAYER_PROJ_NORM = auto() # gemma3n + PER_LAYER_POST_NORM = auto() # gemma3n + ALTUP_PROJ = auto() # gemma3n + ALTUP_UNEMBD_PROJ = auto() # gemma3n + ALTUP_CORRECT_COEF = auto() # gemma3n + ALTUP_CORRECT_SCALE = auto() # gemma3n + ALTUP_PREDICT_COEF = auto() # gemma3n + ALTUP_ROUTER = auto() # gemma3n + ALTUP_ROUTER_NORM = auto() # gemma3n + LAUREL_L = auto() # gemma3n + LAUREL_R = auto() # gemma3n + LAUREL_POST_NORM = auto() # gemma3n SSM_IN = auto() SSM_CONV1D = auto() SSM_X = auto() @@ -597,6 +620,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GEMMA: "gemma", 
MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.GEMMA3: "gemma3", + MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", @@ -682,6 +706,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n + MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n + MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n + MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n + MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n + MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n + MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n + MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n + MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n + MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n + MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", @@ -1486,6 +1526,41 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_PRE_NORM, MODEL_TENSOR.FFN_POST_NORM, ], + MODEL_ARCH.GEMMA3N: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + # altup / laurel + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD, + MODEL_TENSOR.PER_LAYER_MODEL_PROJ, + MODEL_TENSOR.PER_LAYER_INP_GATE, + MODEL_TENSOR.PER_LAYER_PROJ, + MODEL_TENSOR.PER_LAYER_PROJ_NORM, + MODEL_TENSOR.PER_LAYER_POST_NORM, + MODEL_TENSOR.ALTUP_PROJ, + MODEL_TENSOR.ALTUP_UNEMBD_PROJ, + MODEL_TENSOR.ALTUP_CORRECT_COEF, + MODEL_TENSOR.ALTUP_CORRECT_SCALE, + MODEL_TENSOR.ALTUP_PREDICT_COEF, + MODEL_TENSOR.ALTUP_ROUTER, + MODEL_TENSOR.ALTUP_ROUTER_NORM, + MODEL_TENSOR.LAUREL_L, + MODEL_TENSOR.LAUREL_R, + MODEL_TENSOR.LAUREL_POST_NORM, + ], MODEL_ARCH.STARCODER2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index b9b63d052..d32cd479a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -672,6 +672,18 @@ class GGUFWriter: def add_decoder_start_token_id(self, id: int) -> None: self.add_uint32(Keys.LLM.DECODER_START_TOKEN_ID.format(arch=self.arch), id) + def add_embedding_length_per_layer_input(self, value: int) -> None: + self.add_uint32(Keys.LLM.EMBD_LENGTH_PER_LAYER_INP.format(arch=self.arch), value) + + def add_altup_active_idx(self, val: int) -> None: + self.add_uint32(Keys.LLM.ALTUP_ACTIVE_IDX.format(arch=self.arch), val) + + def add_altup_num_inputs(self, val: int) -> None: + 
self.add_uint32(Keys.LLM.ALTUP_NUM_INPUTS.format(arch=self.arch), val) + + def add_activation_sparsity_scale(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.ACTIVATION_SPARSITY_SCALE.format(arch=self.arch), values) + def add_head_count(self, count: int | Sequence[int]) -> None: if isinstance(count, int): self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) @@ -702,6 +714,12 @@ class GGUFWriter: def add_clamp_kqv(self, value: float) -> None: self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value) + def add_shared_kv_layers(self, value: float) -> None: + self.add_float32(Keys.Attention.SHARED_KV_LAYERS.format(arch=self.arch), value) + + def add_sliding_window_pattern(self, value: Sequence[bool]) -> None: + self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value) + def add_logit_scale(self, value: float) -> None: self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 79f044d2a..b30f77dbe 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -480,6 +480,70 @@ class TensorNameMap: "encoder.layer.{bid}.layer_norm_2" # jina-v2-code ), + MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: ( + "model.embed_tokens_per_layer", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_MODEL_PROJ: ( + "model.per_layer_model_projection", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_PROJ_NORM: ( + "model.per_layer_projection_norm", # gemma3n + ), + + MODEL_TENSOR.ALTUP_PROJ: ( + "model.altup_projections", # gemma3n + ), + + MODEL_TENSOR.ALTUP_UNEMBD_PROJ: ( + "model.altup_unembed_projections", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_INP_GATE: ( + "model.layers.{bid}.per_layer_input_gate", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_PROJ: ( + "model.layers.{bid}.per_layer_projection", # gemma3n + ), + + MODEL_TENSOR.PER_LAYER_POST_NORM: ( + "model.layers.{bid}.post_per_layer_input_norm", # gemma3n + ), + + MODEL_TENSOR.ALTUP_CORRECT_COEF: ( + "model.layers.{bid}.altup.correction_coefs", # gemma3n + ), + + MODEL_TENSOR.ALTUP_CORRECT_SCALE: ( + "model.layers.{bid}.altup.correct_output_scale", # gemma3n + ), + + MODEL_TENSOR.ALTUP_PREDICT_COEF: ( + "model.layers.{bid}.altup.prediction_coefs", # gemma3n + ), + + MODEL_TENSOR.ALTUP_ROUTER: ( + "model.layers.{bid}.altup.modality_router", # gemma3n + ), + + MODEL_TENSOR.ALTUP_ROUTER_NORM: ( + "model.layers.{bid}.altup.router_norm", # gemma3n + ), + + MODEL_TENSOR.LAUREL_L: ( + "model.layers.{bid}.laurel.linear_left", # gemma3n + ), + + MODEL_TENSOR.LAUREL_R: ( + "model.layers.{bid}.laurel.linear_right", # gemma3n + ), + + MODEL_TENSOR.LAUREL_POST_NORM: ( + "model.layers.{bid}.laurel.post_laurel_norm", # gemma3n + ), + MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 8dadef204..435e3b9ba 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -42,6 +42,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, "gemma2" }, { LLM_ARCH_GEMMA3, "gemma3" }, + { LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_XVERSE, "xverse" }, @@ -932,6 +933,42 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, }, }, + { + LLM_ARCH_GEMMA3N, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, 
"blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" }, + { LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" }, + { LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" }, + { LLM_TENSOR_ALTUP_UNEMBD_PROJ, "altup_unembd_proj" }, + { LLM_TENSOR_ALTUP_PROJ, "altup_proj" }, + { LLM_TENSOR_PER_LAYER_INP_GATE, "blk.%d.inp_gate" }, + { LLM_TENSOR_PER_LAYER_PROJ, "blk.%d.proj" }, + { LLM_TENSOR_PER_LAYER_POST_NORM, "blk.%d.post_norm" }, + { LLM_TENSOR_ALTUP_CORRECT_COEF, "blk.%d.altup_correct_coef" }, + { LLM_TENSOR_ALTUP_CORRECT_SCALE, "blk.%d.altup_correct_scale" }, + { LLM_TENSOR_ALTUP_PREDICT_COEF, "blk.%d.altup_predict_coef" }, + { LLM_TENSOR_ALTUP_ROUTER, "blk.%d.altup_router" }, + { LLM_TENSOR_ALTUP_ROUTER_NORM, "blk.%d.altup_router_norm" }, + { LLM_TENSOR_LAUREL_L, "blk.%d.laurel_l" }, + { LLM_TENSOR_LAUREL_R, "blk.%d.laurel_r" }, + { LLM_TENSOR_LAUREL_POST_NORM, "blk.%d.laurel_post_norm" }, + }, + }, { LLM_ARCH_STARCODER2, { @@ -1749,6 +1786,23 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_EXP_PROBS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + // altup / laurel (gemma 3n) + {LLM_TENSOR_PER_LAYER_TOKEN_EMBD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_PER_LAYER_MODEL_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_PROJ_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_ALTUP_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_UNEMBD_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_INP_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_PER_LAYER_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ALTUP_CORRECT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_CORRECT_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ALTUP_PREDICT_COEF, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_ROUTER, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ALTUP_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_LAUREL_L, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_LAUREL_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_LAUREL_POST_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 5b0230c15..9181ad053 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -46,6 +46,7 @@ enum llm_arch { LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, LLM_ARCH_GEMMA3, + LLM_ARCH_GEMMA3N, 
LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, @@ -269,6 +270,22 @@ enum llm_tensor { LLM_TENSOR_LAYER_OUT_NORM, LLM_TENSOR_POST_ATTN_NORM, LLM_TENSOR_POST_MLP_NORM, + LLM_TENSOR_PER_LAYER_TOKEN_EMBD, // gemma3n + LLM_TENSOR_PER_LAYER_MODEL_PROJ, // gemma3n + LLM_TENSOR_PER_LAYER_INP_GATE, // gemma3n + LLM_TENSOR_PER_LAYER_PROJ, // gemma3n + LLM_TENSOR_PER_LAYER_PROJ_NORM, // gemma3n + LLM_TENSOR_PER_LAYER_POST_NORM, // gemma3n + LLM_TENSOR_ALTUP_PROJ, // gemma3n + LLM_TENSOR_ALTUP_UNEMBD_PROJ, // gemma3n + LLM_TENSOR_ALTUP_CORRECT_COEF, // gemma3n + LLM_TENSOR_ALTUP_CORRECT_SCALE, // gemma3n + LLM_TENSOR_ALTUP_PREDICT_COEF, // gemma3n + LLM_TENSOR_ALTUP_ROUTER, // gemma3n + LLM_TENSOR_ALTUP_ROUTER_NORM, // gemma3n + LLM_TENSOR_LAUREL_L, // gemma3n + LLM_TENSOR_LAUREL_R, // gemma3n + LLM_TENSOR_LAUREL_POST_NORM, // gemma3n LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 48589a50a..71ee431a9 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -350,6 +350,12 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) { } } +void llm_graph_input_one::set_input(const llama_ubatch *) { + GGML_ASSERT(one && ggml_nelements(one) == 1); + float f_one = 1.0f; + ggml_backend_tensor_set(one, &f_one, 0, sizeof(float)); +} + // // llm_graph_context // @@ -1267,8 +1273,14 @@ ggml_tensor * llm_graph_context::build_attn( // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); + + if (k_cur) { + ggml_build_forward_expand(gf, k_cur); + } + + if (v_cur) { + ggml_build_forward_expand(gf, v_cur); + } const auto * mctx_iswa = static_cast(mctx); @@ -1276,9 +1288,12 @@ ggml_tensor * llm_graph_context::build_attn( const auto * mctx_cur = is_swa ? 
mctx_iswa->get_swa() : mctx_iswa->get_base(); - // store to KV cache - { + // optionally store to KV cache + if (k_cur) { ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il)); + } + + if (v_cur) { ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il)); } diff --git a/src/llama-graph.h b/src/llama-graph.h index b433f266d..4b1ec354d 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -329,6 +329,17 @@ public: const llama_memory_hybrid_context * mctx; }; +// TODO: remove this when ggml_scale_add is implemented +class llm_graph_input_one : public llm_graph_input_i { +public: + llm_graph_input_one() {} + virtual ~llm_graph_input_one() = default; + + void set_input(const llama_ubatch *) override; + + ggml_tensor * one = nullptr; // F32 +}; + // // llm_graph_result // @@ -589,14 +600,15 @@ struct llm_graph_context { llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const; + // note: if k_cur or v_cur are not provided, they will not be stored in the memory ggml_tensor * build_attn( llm_graph_input_attn_kv_unified_iswa * inp, ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] - ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] + ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional + ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional ggml_tensor * kq_b, ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 7b315a9a7..e85afe145 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -143,6 +143,12 @@ struct llama_hparams { uint32_t n_attn_temp_floor_scale = 8192; float f_attn_temp_scale = 0.1; + // gemma3n altup + uint32_t n_altup = 4; // altup_num_inputs + uint32_t i_altup_act = 0; // altup_active_idx + uint32_t laurel_rank = 64; + uint32_t n_embd_altup = 256; + // needed by encoder-decoder models (e.g. 
T5, FLAN-T5) // ref: https://github.com/ggerganov/llama.cpp/pull/8141 llama_token dec_start_token_id = LLAMA_TOKEN_NULL; diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index b506d32ed..8517b722a 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -33,13 +33,19 @@ llama_kv_cache_unified::llama_kv_cache_unified( GGML_ASSERT(kv_size % n_pad == 0); + // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] + auto n_layer_cache = hparams.n_layer; + if (model.arch == LLM_ARCH_GEMMA3N) { + n_layer_cache = 20; + } + // create a context for each buffer type std::map ctx_map; auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()), + /*.mem_size =*/ size_t(2u*n_layer_cache*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -62,7 +68,7 @@ llama_kv_cache_unified::llama_kv_cache_unified( cells.resize(kv_size); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer_cache; il++) { if (filter && !filter(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); continue; @@ -102,6 +108,26 @@ llama_kv_cache_unified::llama_kv_cache_unified( layers.push_back({ il, k, v }); } + // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE] + if (model.arch == LLM_ARCH_GEMMA3N) { + LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1); + + for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) { + if (filter && !filter(il)) { + LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il); + continue; + } + + const bool is_swa = hparams.is_swa(il); + const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1); + + GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end()); + map_layer_ids[il] = map_layer_ids[il_reuse]; + + LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa); + } + } + // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto it : ctx_map) { auto * buft = it.first; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c2835ce67..fc39195ed 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -103,6 +103,8 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; + case LLM_TYPE_E2B: return "E2B"; + case LLM_TYPE_E4B: return "E4B"; default: return "?B"; } } @@ -1017,6 +1019,24 @@ void llama_model::load_hparams(llama_model_loader & ml) { ? 
1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0))) : 1.0f / std::sqrt(float(hparams.n_embd_head_k)); } break; + case LLM_ARCH_GEMMA3N: + { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + hparams.set_swa_pattern(5); + + hparams.rope_freq_base_train_swa = 10000.0f; + hparams.rope_freq_scale_train_swa = 1.0f; + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 30: type = LLM_TYPE_E2B; break; + case 35: type = LLM_TYPE_E4B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_STARCODER2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -2950,6 +2970,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); } } break; + case LLM_ARCH_GEMMA3N: + { + const int64_t n_altup = hparams.n_altup; + const int64_t laurel_rank = hparams.laurel_rank; + const int64_t n_embd_altup = hparams.n_embd_altup; + + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_embd_per_layer = create_tensor(tn(LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "weight"), {n_embd_altup * n_layer, n_vocab}, 0); + + altup_proj = create_tensor(tn(LLM_TENSOR_ALTUP_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0); + altup_unembd_proj = create_tensor(tn(LLM_TENSOR_ALTUP_UNEMBD_PROJ, "weight"), {n_embd, n_embd, n_altup - 1}, 0); + per_layer_model_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_MODEL_PROJ, "weight"), {n_embd, n_embd_altup * n_layer}, 0); + per_layer_proj_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ_NORM, "weight"), {n_embd_altup}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + + // altup & laurel + layer.per_layer_inp_gate = create_tensor(tn(LLM_TENSOR_PER_LAYER_INP_GATE, "weight", i), {n_embd, 
n_embd_altup}, 0); + layer.per_layer_proj = create_tensor(tn(LLM_TENSOR_PER_LAYER_PROJ, "weight", i), {n_embd_altup, n_embd}, 0); + layer.per_layer_post_norm = create_tensor(tn(LLM_TENSOR_PER_LAYER_POST_NORM, "weight", i), {n_embd}, 0); + layer.altup_correct_coef = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_COEF, "weight", i), {n_altup, n_altup}, 0); + layer.altup_correct_scale = create_tensor(tn(LLM_TENSOR_ALTUP_CORRECT_SCALE, "weight", i), {n_embd}, 0); + layer.altup_predict_coef = create_tensor(tn(LLM_TENSOR_ALTUP_PREDICT_COEF, "weight", i), {n_altup, n_altup * n_altup}, 0); + layer.altup_router = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER, "weight", i), {n_embd, n_altup}, 0); + layer.altup_router_norm = create_tensor(tn(LLM_TENSOR_ALTUP_ROUTER_NORM, "weight", i), {n_embd}, 0); + layer.laurel_l = create_tensor(tn(LLM_TENSOR_LAUREL_L, "weight", i), {n_embd, laurel_rank}, 0); + layer.laurel_r = create_tensor(tn(LLM_TENSOR_LAUREL_R, "weight", i), {laurel_rank, n_embd}, 0); + layer.laurel_post_norm = create_tensor(tn(LLM_TENSOR_LAUREL_POST_NORM, "weight", i), {n_embd}, 0); + } + } break; case LLM_ARCH_STARCODER2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8980,6 +9056,442 @@ struct llm_build_gemma3_iswa : public llm_graph_context { } }; +struct llm_build_gemma3n_iswa : public llm_graph_context { + const llama_model & model; + ggml_cgraph * gf; + + const int64_t n_embd_head; + const int64_t n_embd_altup; + const int64_t n_altup; + const int i_altup_act; + const int n_layer_kv = 20; // number of layers having KV [KV_REUSE] + const int n_layer_sparsity = 10; // number of layers using activation sparsity + const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95) + + ggml_tensor * one; // containing single element 1.0f + + llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) + : llm_graph_context(params), + model(model), + gf(gf), + n_embd_head(model.hparams.n_embd_head_k), + n_embd_altup(model.hparams.n_embd_altup), + n_altup(model.hparams.n_altup), + i_altup_act(model.hparams.i_altup_act) { + ggml_tensor * cur; + ggml_tensor * inpL; + + // TODO: remove this when ggml_scale_add is implemented + one = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + { + auto inp = std::make_unique(); + inp->one = one; + res->add_input(std::move(inp)); + } + + inpL = build_inp_embd(model.tok_embd); + + // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) + if (ubatch.token) { + inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd)); + cb(inpL, "inp_scaled", -1); + } + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // TODO: is causal == true correct? 
might need some changes + auto * inp_attn = build_attn_inp_kv_unified_iswa(); + + // inp_per_layer shape: [n_embd_altup, n_tokens, n_layer] + ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs()); + + // inpL now has only 1 altup, project it to the rest of the altups + // these "added" altups will be concat to the last dim of inpL + { + ggml_tensor * target_magnitude = calc_magnitude(inpL); + ggml_tensor * inp_repeated = ggml_repeat_4d(ctx0, inpL, n_embd, n_tokens, n_altup - 1, 1); + ggml_tensor * altup_added = ggml_mul_mat(ctx0, model.altup_proj, inp_repeated); // shape: [n_embd, n_tokens, n_altup - 1] + ggml_tensor * new_magnitude = calc_magnitude(altup_added); + altup_added = ggml_div(ctx0, + ggml_mul(ctx0, altup_added, target_magnitude), + new_magnitude); + inpL = ggml_concat(ctx0, inpL, altup_added, 2); // shape: [n_embd, n_tokens, n_altup] + cb(inpL, "inp_stacked", -1); + } + + // inpL now has shape: [n_embd, n_tokens, n_altup] + // inp_per_layer now has shape: [n_embd_altup, n_tokens, n_layer] + + for (int il = 0; il < n_layer; ++il) { + // this block is made to be closely resemble Gemma3p5DecoderLayer on python code + const bool has_kv = (il < n_layer_kv); + + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + + ggml_tensor * cur = inpL; // [n_embd, n_tokens, n_altup] + ggml_tensor * predictions = altup_predict(cur, il); // [n_embd, n_tokens, n_altup] + + // predicted value will go through self-attention and laurel + ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); // [n_embd, n_tokens] + cur = active_prediction; + cb(cur, "active_prediction", il); + + // norm + cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // laurel + ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens] + + // self-attention + if (has_kv) { + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + Vcur = ggml_rms_norm(ctx0, Vcur, hparams.f_norm_rms_eps); + + cb(Qcur, "Qcur_normed", il); + cb(Kcur, "Kcur_normed", il); + cb(Vcur, "Vcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); + } else { + // no KV layers + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + Qcur = build_norm(Qcur, 
model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); + } + + cur = build_norm(cur, + model.layers[il].attn_post_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, active_prediction); // [n_embd, n_tokens] + cb(cur, "attn_gated", il); + + ggml_tensor * attn_laurel = ggml_scale(ctx0, + ggml_add(ctx0, cur, laurel_out), + 1.0f / sqrtf(2.0f)); // [n_embd, n_tokens] + cb(attn_laurel, "attn_laurel", il); + + cur = build_norm(attn_laurel, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network + { + ggml_tensor * up_proj = build_lora_mm(model.layers[il].ffn_up, cur); + ggml_tensor * gate_proj = build_lora_mm(model.layers[il].ffn_gate, cur); + + if (il < n_layer_sparsity) { + // apply activation sparsity + gate_proj = gaussian_topk(gate_proj); + } + gate_proj = ggml_gelu(ctx0, gate_proj); + + cur = ggml_mul(ctx0, up_proj, gate_proj); + cur = build_lora_mm(model.layers[il].ffn_down, cur); + cb(cur, "ffn_out", il); + } + + cur = build_norm(cur, + model.layers[il].ffn_post_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + ggml_tensor * attn_ffw_laurel_gated = ggml_add(ctx0, cur, attn_laurel); // [n_embd, n_tokens] + cb(attn_ffw_laurel_gated, "attn_ffw_laurel_gated", il); + + ggml_tensor * corrected = altup_correct(predictions, attn_ffw_laurel_gated, il); // [n_embd, n_tokens, n_altup] + + ggml_tensor * first_prediction; // [n_embd, n_tokens] + { + first_prediction = view_2d_slice(corrected, i_altup_act); // [n_embd, n_tokens] + first_prediction = ggml_mul(ctx0, first_prediction, model.layers[il].altup_correct_scale); + first_prediction = build_lora_mm(model.layers[il].per_layer_inp_gate, first_prediction); + first_prediction = ggml_gelu(ctx0, first_prediction); // [n_embd_altup, n_tokens] + cb(first_prediction, "first_prediction_gated", il); + ggml_tensor * inp_this_layer = view_2d_slice(inp_per_layer, il); // [n_embd_altup, n_tokens] + first_prediction = ggml_mul(ctx0, first_prediction, inp_this_layer); // [n_embd_altup, n_tokens] + cb(first_prediction, "first_prediction_scaled", il); + + first_prediction = build_lora_mm(model.layers[il].per_layer_proj, first_prediction); // [n_embd, n_tokens] + first_prediction = build_norm(first_prediction, + model.layers[il].per_layer_post_norm, NULL, + LLM_NORM_RMS, il); + cb(first_prediction, "first_prediction_out", il); + } + + // equivalent to python code: corrected_predictions[1:] += first_prediction + { + ggml_tensor * slice_first = view_2d_slice(corrected, 0); + ggml_tensor * slice_rest = ggml_view_3d(ctx0, corrected, n_embd, n_tokens, n_altup - 1, + ggml_row_size(corrected->type, n_embd), + ggml_row_size(corrected->type, n_embd*n_tokens), + n_embd*n_tokens*ggml_element_size(corrected)); + ggml_tensor * tmp = ggml_add(ctx0, slice_rest, first_prediction); // [n_embd, n_tokens, n_altup - 1] + corrected = ggml_concat(ctx0, slice_first, tmp, 2); // [n_embd, n_tokens, n_altup] + } + + cur = corrected; // [n_embd, n_tokens, n_altup] + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; // [n_embd, n_tokens, n_altup] + + // 
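        // illustration: the merge below amounts to
        //   h = (h[0] + sum_{i>0} unembd(h[i]) * |h[act]| / |unembd(h[i])|) / n_altup
        // i.e. the extra altup streams are unembedded, magnitude-matched to the
        // active stream, then averaged together (the torch.mean in the reference):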
cur now has multiple altup(s), we want to merge them back to 1 altup + { + ggml_tensor * target_magnitude = calc_magnitude(view_2d_slice(cur, i_altup_act)); // [n_embd, n_tokens] + // do a view to skip the first slice (active altup) + ggml_tensor * alt_slice = ggml_view_3d(ctx0, cur, n_embd, n_tokens, n_altup - 1, + ggml_row_size(cur->type, n_embd), + ggml_row_size(cur->type, n_embd*n_tokens), + n_embd*n_tokens*ggml_element_size(cur)); + ggml_tensor * altup_unembd = ggml_mul_mat(ctx0, model.altup_unembd_proj, alt_slice); // shape: [n_embd, n_tokens, n_altup - 1] + ggml_tensor * new_magnitude = calc_magnitude(altup_unembd); + altup_unembd = ggml_div(ctx0, + ggml_mul(ctx0, altup_unembd, target_magnitude), + new_magnitude); + cb(altup_unembd, "altup_unembd", -1); + + // equivalent to torch.mean(hidden_states, dim=0) + cur = view_2d_slice(cur, 0); // [n_embd, n_tokens] + for (int i = 0; i < n_altup - 1; ++i) { + cur = ggml_add(ctx0, cur, view_2d_slice(altup_unembd, i)); + } + cur = ggml_scale(ctx0, cur, 1.0f / float(n_altup)); // [n_embd, n_tokens] + cb(cur, "unembd_merged", -1); + } + + // cur now has shape: [n_embd, n_tokens] + + // TODO: move this to right after the last KV layer + { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + { + // final logit soft-capping + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * calc_magnitude(ggml_tensor * x) { + return ggml_sqrt(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, x))); + } + + // get 2D slice view from a 3D tensor, the idx corresponds to the 3rd dim + ggml_tensor * view_2d_slice(ggml_tensor * x, int idx) { + GGML_ASSERT(idx < (int)x->ne[2]); + return ggml_view_2d(ctx0, x, x->ne[0], x->ne[1], + ggml_row_size(x->type, x->ne[0]), + idx * x->ne[0] * x->ne[1] * ggml_element_size(x)); + } + + // equivalent to get_per_layer_inputs() in python code + // output shape: [n_embd_altup, n_layer, n_tokens] + ggml_tensor * get_per_layer_inputs() { + auto inp = std::make_unique(); + ggml_tensor * inp_per_layer; + if (ubatch.token) { + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + ggml_set_input(inp->tokens); + res->t_tokens = inp->tokens; + inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens); + inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens); + inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float)n_embd_altup)); + cb(inp_per_layer, "inp_per_layer_selected", -1); + } else { + GGML_ABORT("TODO: support embd input"); + } + res->add_input(std::move(inp)); + return inp_per_layer; + } + + // equivalent to project_per_layer_inputs() in python code + // this calculates the per-layer inputs, so the final tensor shape will have n_layer as the last dim + // output shape: [n_embd_altup, n_tokens, n_layer] + ggml_tensor * project_per_layer_inputs(ggml_tensor * inputs_embeds, ggml_tensor * inp_per_layer) { + const float per_layer_projection_scale = 1.0f / sqrtf((float)n_embd); + const float per_layer_input_scale = 1.0f / sqrtf(2.0f); + + ggml_tensor * per_layer_proj = 
ggml_mul_mat(ctx0, model.per_layer_model_proj, inputs_embeds); + per_layer_proj = ggml_scale(ctx0, per_layer_proj, per_layer_projection_scale); + per_layer_proj = ggml_reshape_3d(ctx0, per_layer_proj, n_embd_altup, n_layer, n_tokens); + per_layer_proj = build_norm(per_layer_proj, + model.per_layer_proj_norm, NULL, + LLM_NORM_RMS, -1); // [n_embd_altup, n_layer, n_tokens] + cb(per_layer_proj, "per_layer_proj", -1); + + inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj); + inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale); + cb(inp_per_layer, "inp_per_layer", -1); + + // permute to shape: [n_embd_altup, n_tokens, n_layer] + inp_per_layer = ggml_cont(ctx0, ggml_permute(ctx0, inp_per_layer, 0, 2, 1, 3)); + return inp_per_layer; + } + + // input cur shape: [n_altup, n_tokens] + // output shape: [n_altup, n_tokens] + ggml_tensor * laurel(ggml_tensor * cur, int il) { + ggml_tensor * tmp = cur; + tmp = build_lora_mm(model.layers[il].laurel_l, tmp); + tmp = build_lora_mm(model.layers[il].laurel_r, tmp); + tmp = build_norm(tmp, model.layers[il].laurel_post_norm, NULL, LLM_NORM_RMS, il); + tmp = ggml_add(ctx0, tmp, cur); + cb(tmp, "laurel_out", il); + return tmp; + } + + // input x shape: [n_embd, n_tokens] + // output shape: [n_embd, n_tokens] + ggml_tensor * gaussian_topk(ggml_tensor * x) { + ggml_tensor * mean = ggml_mean(ctx0, x); + ggml_tensor * std = ggml_sqrt(ctx0, ggml_scale(ctx0, + ggml_sum_rows(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x, mean))), + 1.0f / (float)(x->ne[0] - 1) + )); + ggml_tensor * cutoff_x = ggml_add(ctx0, mean, ggml_scale(ctx0, std, f_sparsity_std_mul)); + return ggml_relu(ctx0, ggml_sub(ctx0, x, cutoff_x)); + } + + // + // altup functions + // + + // equivalent to compute_router_modalities() in python code + // input x shape: [n_embd, n_tokens] + // output shape: [n_altup, n_tokens] + ggml_tensor * altup_compute_router_modalities(ggml_tensor * x, int il) { + ggml_tensor * router_inputs = build_norm(x, + model.layers[il].altup_router_norm, NULL, + LLM_NORM_RMS, il); + + // router_input_scale + router_inputs = ggml_scale(ctx0, router_inputs, 1.0f / (float)n_embd); + + ggml_tensor * output = ggml_mul_mat(ctx0, model.layers[il].altup_router, router_inputs); + return ggml_tanh(ctx0, output); // [n_altup, n_tokens] + } + + // input cur shape: [n_embd, n_tokens, n_altup] + // output shape: [n_embd, n_tokens, n_altup] + ggml_tensor * altup_predict(ggml_tensor * cur, int il) { + ggml_tensor * activated = view_2d_slice(cur, i_altup_act); // [n_embd, n_tokens] + ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens] + cb(modalities, "modalities", il); + + ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_predict_coef, modalities); + cb(all_coefs, "all_coefs", il); + // first dim now having n_altup^2 elements, we reshape it to 2D (so we end up with 3D tensor) + all_coefs = ggml_reshape_3d(ctx0, all_coefs, n_altup, n_altup, n_tokens); + + // permute to [n_altup, n_embd, n_tokens] + ggml_tensor * cur_permuted = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + ggml_tensor * predictions = ggml_mul_mat(ctx0, cur_permuted, all_coefs); // [n_altup, n_embd, n_tokens] + + // final shape must be the same as cur: [n_embd, n_tokens, n_altup] + predictions = ggml_cont(ctx0, ggml_permute(ctx0, predictions, 0, 2, 1, 3)); + predictions = ggml_add(ctx0, predictions, cur); + cb(predictions, "predictions", il); + + return predictions; + } + + // input predictions shape: [n_embd, n_tokens, n_altup] + // input 
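    // illustration: the correction step below computes
    //   corrected[i] = predictions[i] + (1 + coef[i]) * (activated - predictions[act])
    // where coef = altup_correct_coef * modalities; the +1 comes from the `one`
    // input tensor, a placeholder until a fused ggml_scale_add op exists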
activated shape: [n_embd, n_tokens] + // output shape: [n_embd, n_tokens, n_altup] + ggml_tensor * altup_correct(ggml_tensor * predictions, ggml_tensor * activated, int il) { + ggml_tensor * modalities = altup_compute_router_modalities(activated, il); // [n_altup, n_tokens] + cb(modalities, "modalities", il); + + ggml_tensor * active_prediction = view_2d_slice(predictions, i_altup_act); + ggml_tensor * innovation = ggml_sub(ctx0, activated, active_prediction); // [n_embd, n_tokens] + cb(innovation, "innovation", il); + + ggml_tensor * all_coefs = build_lora_mm(model.layers[il].altup_correct_coef, modalities); // [n_altup, n_tokens] + all_coefs = ggml_add(ctx0, all_coefs, one); + cb(all_coefs, "all_coefs", il); + all_coefs = ggml_cont(ctx0, ggml_transpose(ctx0, all_coefs)); // [n_tokens, n_altup] + all_coefs = ggml_reshape_3d(ctx0, all_coefs, 1, n_tokens, n_altup); // [1, n_tokens, n_altup] + + innovation = ggml_repeat_4d(ctx0, innovation, n_embd, n_tokens, n_altup, 1); + ggml_tensor * corrected = ggml_mul(ctx0, innovation, all_coefs); // [n_embd, n_tokens, n_altup] + corrected = ggml_add(ctx0, corrected, predictions); // [n_embd, n_tokens, n_altup] + cb(corrected, "corrected", il); + + return corrected; + } +}; + // TODO: move up next to build_starcoder struct llm_build_starcoder2 : public llm_graph_context { llm_build_starcoder2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { @@ -13974,6 +14486,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_GEMMA3N: + { + llm = std::make_unique(*this, params, gf); + } break; case LLM_ARCH_STARCODER2: { llm = std::make_unique(*this, params, gf); @@ -14295,6 +14811,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GEMMA: case LLM_ARCH_GEMMA2: case LLM_ARCH_GEMMA3: + case LLM_ARCH_GEMMA3N: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: case LLM_ARCH_GPTNEOX: diff --git a/src/llama-model.h b/src/llama-model.h index 06e6c6879..40063b790 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -95,6 +95,8 @@ enum llm_type { LLM_TYPE_17B_128E, // llama4 Maverick LLM_TYPE_30B_A3B, LLM_TYPE_235B_A22B, + LLM_TYPE_E2B, + LLM_TYPE_E4B, }; std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type); @@ -316,6 +318,19 @@ struct llama_layer { struct ggml_tensor * ffn_up_scale = nullptr; struct ggml_tensor * ffn_down_scale = nullptr; + // altup & laurel + struct ggml_tensor * per_layer_inp_gate = nullptr; + struct ggml_tensor * per_layer_proj = nullptr; + struct ggml_tensor * per_layer_post_norm = nullptr; + struct ggml_tensor * altup_correct_coef = nullptr; + struct ggml_tensor * altup_correct_scale = nullptr; + struct ggml_tensor * altup_predict_coef = nullptr; + struct ggml_tensor * altup_router = nullptr; + struct ggml_tensor * altup_router_norm = nullptr; + struct ggml_tensor * laurel_l = nullptr; + struct ggml_tensor * laurel_r = nullptr; + struct ggml_tensor * laurel_post_norm = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; @@ -354,6 +369,13 @@ struct llama_model { struct ggml_tensor * conv1d = nullptr; struct ggml_tensor * conv1d_b = nullptr; + // gemma3n altup + struct ggml_tensor * tok_embd_per_layer = nullptr; + struct ggml_tensor * altup_proj = nullptr; + struct ggml_tensor * altup_unembd_proj = nullptr; + struct ggml_tensor * per_layer_model_proj = nullptr; + struct ggml_tensor * per_layer_proj_norm = nullptr; + std::vector 
layers; llama_model_params params; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43229e193..f4b5713d7 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -223,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q6_K; } } - } else if (name == "token_embd.weight") { + } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") { if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { @@ -830,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + // these are very small (e.g. 4x4) + quantize &= name.find("altup") == std::string::npos; + quantize &= name.find("laurel") == std::string::npos; + + // these are not too big so keep them as it is + quantize &= name.find("per_layer_model_proj") == std::string::npos; + // do not quantize positional embeddings and token types (BERT) quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); From f667f1e6244e1f420512fa66692b7096ff17f366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 27 Jun 2025 10:42:19 +0200 Subject: [PATCH 19/54] convert : fix broken sentencepiece vocab (#14416) --- convert_hf_to_gguf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4f2339a02..aed595e25 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -936,7 +936,11 @@ class TextModel(ModelBase): scores: list[float] = [-10000.0] * vocab_size toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size - for token_id in range(vocab_size): + for token_id in range(tokenizer.vocab_size()): + if token_id >= vocab_size: + logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') + break + piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) @@ -951,10 +955,6 @@ class TextModel(ModelBase): elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE - if token_id >= vocab_size: - logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}') - break - tokens[token_id] = text scores[token_id] = score toktypes[token_id] = toktype From 8d94219a4a7f2da72ee542019ca01f36af93d1d6 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Fri, 27 Jun 2025 16:41:40 +0300 Subject: [PATCH 20/54] ggml : add ggml_set_rows (#14274) * ggml : add ggml_set_rows Add ggml_set_rows(a, b, c) which copies rows from 'b' into 'a' using indices from 'c'. 
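A minimal reference sketch of the flat F32 case (ignoring broadcast and
quantized destinations; the function name and signature here are
illustrative only, not part of the patch):

  #include <assert.h>
  #include <stdint.h>
  #include <string.h>

  // copy each source row r into destination row idx[r];
  // destination rows not named in idx are left untouched
  static void set_rows_ref_f32(
          float * dst, int64_t n_rows_dst,
          const float * src, int64_t n_rows_src,
          const int64_t * idx, int64_t n_embd) {
      for (int64_t r = 0; r < n_rows_src; ++r) {
          assert(idx[r] >= 0 && idx[r] < n_rows_dst);
          memcpy(dst + idx[r]*n_embd, src + r*n_embd, (size_t)n_embd*sizeof(float));
      }
  }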
ref: #8366 * use I64 for indices * ggml : add repeat impl for i64 * ggml : add ggml_is_contiguous_rows * ggml : ggml_set_rows support broadcast * ggml : ggml_set_rows support quantized dst ggml-ci * ggml : support GGML_TYPE_F32 ".from_float" trait * ggml : ggml_set_rows update comment + better index name * tests : add ggml_set_rows * metal : add ggml_set_rows implementation ggml-ci * ggml : simplify forward_dup_f32 * ggml : fix supports_op * tests : add comment to set_rows * ggml : leave the repeat_i64 for a separate PR ggml-ci * ggml : set_rows use std::min instead of MIN * ggml : better error message for set_rows unsupported type * metal : perform op->type check only once * tests : more consistent implementation + more tests ggml-ci --------- Co-authored-by: Georgi Gerganov --- examples/eval-callback/eval-callback.cpp | 2 + ggml/include/ggml-cpu.h | 1 + ggml/include/ggml.h | 21 + ggml/src/ggml-cpu/ggml-cpu.c | 10 + ggml/src/ggml-cpu/ggml-cpu.cpp | 1 + ggml/src/ggml-cpu/ops.cpp | 96 ++++- ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml-metal/ggml-metal-impl.h | 16 + ggml/src/ggml-metal/ggml-metal.m | 112 +++++- ggml/src/ggml-metal/ggml-metal.metal | 469 ++++++++++++++--------- ggml/src/ggml.c | 41 +- tests/test-backend-ops.cpp | 87 +++++ 12 files changed, 653 insertions(+), 204 deletions(-) diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index fb188f5a9..bbbec6a01 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -55,6 +55,8 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); } else if (type == GGML_TYPE_F32) { v = *(float *) &data[i]; + } else if (type == GGML_TYPE_I64) { + v = (float) *(int64_t *) &data[i]; } else if (type == GGML_TYPE_I32) { v = (float) *(int32_t *) &data[i]; } else if (type == GGML_TYPE_I16) { diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index e3b79d09b..be40b1009 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -134,6 +134,7 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 9c4e24023..2b1bd6e0f 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -470,6 +470,7 @@ extern "C" { GGML_OP_TRANSPOSE, GGML_OP_GET_ROWS, GGML_OP_GET_ROWS_BACK, + GGML_OP_SET_ROWS, GGML_OP_DIAG, GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_ZERO, @@ -687,6 +688,9 @@ extern "C" { // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); + // true if the elements in dimension 0 are contiguous, or there is just 1 block of elements + GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -1375,6 +1379,23 @@ extern "C" { struct ggml_tensor * b, // row indices struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape + // a TD 
[n_embd, ne1, ne2, ne3] + // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3 + // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1) + // + // undefined behavior if destination rows overlap + // + // broadcast: + // ne2 % ne11 == 0 + // ne3 % ne12 == 0 + // + // return view(a) + GGML_API struct ggml_tensor * ggml_set_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, // destination + struct ggml_tensor * b, // source + struct ggml_tensor * c); // row indices + GGML_API struct ggml_tensor * ggml_diag( struct ggml_context * ctx, struct ggml_tensor * a); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7cae96f4b..2042ee71f 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -195,6 +195,7 @@ typedef pthread_t ggml_thread_t; static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp32, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, .nrows = 1, @@ -1817,6 +1818,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_get_rows_back(params, tensor); } break; + case GGML_OP_SET_ROWS: + { + ggml_compute_forward_set_rows(params, tensor); + } break; case GGML_OP_DIAG: { ggml_compute_forward_diag(params, tensor); @@ -2170,6 +2175,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = n_threads; } break; case GGML_OP_GET_ROWS: + case GGML_OP_SET_ROWS: { // FIXME: get_rows can use additional threads, but the cost of launching additional threads // decreases performance with GPU offloading @@ -3124,6 +3130,10 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp32(const float * x, float * y, int64_t n) { + memcpy(y, x, n * sizeof(float)); +} + void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { int64_t i = 0; #if defined(__F16C__) diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index a98866a2d..c9daa4c39 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -416,6 +416,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st switch (op->op) { case GGML_OP_CPY: + case GGML_OP_SET_ROWS: return op->type != GGML_TYPE_IQ3_XXS && op->type != GGML_TYPE_IQ3_S && diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 8531baf6c..9f17ea43c 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -696,24 +696,8 @@ static void ggml_compute_forward_dup_f32( if (ggml_is_contiguous(dst)) { // TODO: simplify if (nb00 == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (ggml_get_type_traits_cpu(dst->type)->from_float) { - ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float; + if (ggml_get_type_traits_cpu(dst->type)->from_float) { + ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float; size_t id = 0; size_t rs = nb0 * 
(ne00 / ggml_blck_size(dst->type)); @@ -724,7 +708,7 @@ static void ggml_compute_forward_dup_f32( id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - quantize_row_q(src0_ptr, dst_ptr + id, ne00); + from_float(src0_ptr, dst_ptr + id, ne00); id += rs; } id += rs * (ne01 - ir1); @@ -2300,6 +2284,12 @@ void ggml_compute_forward_repeat( { ggml_compute_forward_repeat_f32(params, dst); } break; + // TODO: templateify the implemenation and support for I64 + // ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225 + //case GGML_TYPE_I64: + // { + // ggml_compute_forward_repeat_i64(params, dst); + // } break; default: { GGML_ABORT("fatal error"); @@ -4470,6 +4460,74 @@ void ggml_compute_forward_get_rows( //} } +static void ggml_compute_forward_set_rows_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ne01; + + assert(ne0 == nc); + assert(ne2 == ne02); + assert(ne3 == ne03); + assert(src0->type == GGML_TYPE_F32); + assert(ne02 % ne11 == 0); + assert(ne03 % ne12 == 0); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int64_t dr = (nr + nth - 1)/nth; + + // row range for this thread + const int64_t ir0 = dr*ith; + const int64_t ir1 = std::min(ir0 + dr, nr); + + ggml_from_float_t const from_float = ggml_get_type_traits_cpu(dst->type)->from_float; + + for (int64_t i03 = 0; i03 < ne03; ++i03) { + for (int64_t i02 = 0; i02 < ne02; ++i02) { + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i03%ne12; + const int64_t i11 = i02%ne11; + const int64_t i10 = i; + + const int64_t i1 = *(int64_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + GGML_ASSERT(i1 >= 0 && i1 < ne1); + + from_float( + (const float *) ((char *) src0->data + i*nb01 + i02*nb02 + i03*nb03), + ((char *) dst->data + i1*nb1 + i02*nb2 + i03*nb3), nc); + } + } + } +} + +void ggml_compute_forward_set_rows( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_set_rows_f32(params, dst); + } break; + default: + { + GGML_ABORT("src0->type = %d (%s) not supported", src0->type, ggml_type_name(src0->type)); + } + } +} + // ggml_compute_forward_get_rows_back static void ggml_compute_forward_get_rows_back_f32_f16( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 2d8544d7d..3a395fdcd 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -53,6 +53,7 @@ void ggml_compute_forward_permute(const struct ggml_compute_params * params, str void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_set_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params 
* params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 17eab976f..260440aed 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -521,6 +521,22 @@ typedef struct { uint64_t nb2; } ggml_metal_kargs_get_rows; +typedef struct { + int32_t nk0; + int32_t ne01; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + int32_t ne11; + int32_t ne12; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb1; + uint64_t nb2; + uint64_t nb3; +} ggml_metal_kargs_set_rows; + typedef struct { int64_t ne00; int64_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index d8d30cc0b..349f0ff99 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -202,6 +202,15 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, + GGML_METAL_KERNEL_TYPE_SET_ROWS_F32, + GGML_METAL_KERNEL_TYPE_SET_ROWS_F16, + GGML_METAL_KERNEL_TYPE_SET_ROWS_BF16, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q8_0, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_0, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_1, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0, + GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, + GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_RMS_NORM, GGML_METAL_KERNEL_TYPE_L2_NORM, GGML_METAL_KERNEL_TYPE_GROUP_NORM, @@ -1169,6 +1178,15 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_F32, set_rows_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_F16, set_rows_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_BF16, set_rows_bf16, use_bfloat); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q8_0, set_rows_q8_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_0, set_rows_q4_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_1, set_rows_q4_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0, set_rows_q5_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1, set_rows_q5_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL, set_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction); @@ -1635,6 +1653,10 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex const bool use_bfloat = ctx_dev->use_bfloat; if (!use_bfloat) { + if (op->type == GGML_TYPE_BF16) { + return false; + } + for (size_t i = 0, n = 3; i < n; ++i) { if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) { return false; @@ -1804,6 +1826,27 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex { return op->ne[3] == 1; } + case GGML_OP_SET_ROWS: + { + if (op->src[0]->type != GGML_TYPE_F32) { + return false; + } + + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_BF16: + case GGML_TYPE_Q8_0: + case 
GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_IQ4_NL: + return true; + default: + return false; + }; + } default: return false; } @@ -3777,13 +3820,74 @@ static bool ggml_metal_encode_node( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; } break; + case GGML_OP_SET_ROWS: + { + id pipeline = nil; + + switch (dst->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_F32 ].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_F16 ].pipeline; break; + case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_BF16 ].pipeline; break; + case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q8_0 ].pipeline; break; + case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_0 ].pipeline; break; + case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q4_1 ].pipeline; break; + case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_0 ].pipeline; break; + case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_Q5_1 ].pipeline; break; + case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SET_ROWS_IQ4_NL].pipeline; break; + default: GGML_ABORT("not implemented"); + } + + const int32_t nk0 = ne0/ggml_blck_size(dst->type); + + int nth = 32; // SIMD width + + while (nth < nk0 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + + int nrptg = 1; + if (nth > nk0) { + nrptg = (nth + nk0 - 1)/nk0; + nth = nk0; + + if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) { + nrptg--; + } + } + + nth = MIN(nth, nk0); + + ggml_metal_kargs_set_rows args = { + /*.nk0 =*/ nk0, + /*.ne01 =*/ ne01, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne11 =*/ ne11, + /*.ne12 =*/ ne12, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb1 =*/ nb1, + /*.nb2 =*/ nb2, + /*.nb3 =*/ nb3, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:3]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)]; + } break; case GGML_OP_RMS_NORM: { GGML_ASSERT(ne00 % 4 == 0); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 5f004a856..984a0ab50 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -35,6 +35,17 @@ constexpr constant static float kvalues_iq4nl_f[16] = { -127.f, -104.f, -83.f, -65.f, -49.f, -35.f, -22.f, -10.f, 1.f, 13.f, 25.f, 38.f, 53.f, 69.f, 89.f, 113.f }; +static inline int best_index_int8(int n, constant float * val, float x) { + if (x <= val[0]) return 
0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + // NOTE: this is not dequantizing - we are simply fitting the template template void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) { @@ -97,6 +108,173 @@ void dequantize_q4_0_t4(device const block_q4_0 * xb, short il, thread type4 & r } } +void quantize_q4_0(device const float * src, device block_q4_0 & dst) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -8; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_0/2 + j]*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); + + dst.qs[j] = xi0; + dst.qs[j] |= xi1 << 4; + } +} + +void quantize_q4_1(device const float * src, device block_q4_1 & dst) { + float min = FLT_MAX; + float max = -FLT_MAX; + + for (int j = 0; j < QK4_1; j++) { + const float v = src[j]; + if (min > v) min = v; + if (max < v) max = v; + } + + const float d = (max - min) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + dst.m = min; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK4_1/2 + j] - min)*id; + + const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); + + dst.qs[j] = xi0; + dst.qs[j] |= xi1 << 4; + } +} + +void quantize_q5_0(device const float * src, device block_q5_0 & dst) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK5_0; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / -16; + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK5_0/2 + j]*id; + + const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); + + dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + + thread const uint8_t * qh8 = (thread const uint8_t *)&qh; + + for (int j = 0; j < 4; ++j) { + dst.qh[j] = qh8[j]; + } +} + +void quantize_q5_1(device const float * src, device block_q5_1 & dst) { + float max = src[0]; + float min = src[0]; + + for (int j = 1; j < QK5_1; j++) { + const float v = src[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 
1.0f/d : 0.0f; + + dst.d = d; + dst.m = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1/2; ++j) { + const float x0 = (src[0 + j] - min)*id; + const float x1 = (src[QK5_1/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + dst.qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); + } + + thread const uint8_t * qh8 = (thread const uint8_t *)&qh; + + for (int j = 0; j < 4; ++j) { + dst.qh[j] = qh8[j]; + } +} + +void quantize_iq4_nl(device const float * src, device block_iq4_nl & dst) { + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int j = 0; j < QK4_NL; j++) { + const float v = src[j]; + if (amax < fabs(v)) { + amax = fabs(v); + max = v; + } + } + + const float d = max / kvalues_iq4nl_f[0]; + const float id = d ? 1.0f/d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + const float x0 = src[0 + j]*id; + const float x1 = src[QK4_NL/2 + j]*id; + + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1); + + dst.qs[j] = xi0 | (xi1 << 4); + + const float v0 = kvalues_iq4nl_f[xi0]; + const float v1 = kvalues_iq4nl_f[xi1]; + const float w0 = src[0 + j]*src[0 + j]; + const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j]; + sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j]; + sumq2 += w0*v0*v0 + w1*v1*v1; + + } + + dst.d = sumq2 > 0 ? sumqx/sumq2 : d; +} + template void dequantize_q4_1(device const block_q4_1 * xb, short il, thread type4x4 & reg) { device const uint16_t * qs = ((device const uint16_t *)xb + 2); @@ -279,6 +457,26 @@ void dequantize_q8_0_t4(device const block_q8_0 *xb, short il, thread type4 & re } } +void quantize_q8_0(device const float * src, device block_q8_0 & dst) { + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = src[j]; + amax = MAX(amax, fabs(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dst.d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = src[j]*id; + + dst.qs[j] = round(x0); + } +} + template void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg) { const float d = xb->d; @@ -4410,6 +4608,7 @@ template [[host_name("kernel_cpy_bf16_f32")]] kernel kernel_cpy_t kernel_cpy; #endif +// TODO: templetify these kernels kernel void kernel_cpy_f32_q8_0( constant ggml_metal_kargs_cpy & args, device const char * src0, @@ -4433,23 +4632,7 @@ kernel void kernel_cpy_f32_q8_0( for (int64_t i00 = tpitg.x*QK8_0; i00 < args.ne00; i00 += ntg.x*QK8_0) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = src[j]; - amax = MAX(amax, fabs(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - dst_data[i00/QK8_0].d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = src[j]*id; - - dst_data[i00/QK8_0].qs[j] = round(x0); - } + quantize_q8_0(src, dst_data[i00/QK8_0]); } } @@ -4476,32 +4659,7 @@ kernel void kernel_cpy_f32_q4_0( for (int64_t i00 = tpitg.x*QK4_0; i00 < args.ne00; i00 += ntg.x*QK4_0) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK4_0; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / -8; - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK4_0].d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK4_0/2 + j]*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); - - dst_data[i00/QK4_0].qs[j] = xi0; - dst_data[i00/QK4_0].qs[j] |= xi1 << 4; - } + quantize_q4_0(src, dst_data[i00/QK4_0]); } } @@ -4528,31 +4686,7 @@ kernel void kernel_cpy_f32_q4_1( for (int64_t i00 = tpitg.x*QK4_1; i00 < args.ne00; i00 += ntg.x*QK4_1) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int j = 0; j < QK4_1; j++) { - const float v = src[j]; - if (min > v) min = v; - if (max < v) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK4_1].d = d; - dst_data[i00/QK4_1].m = min; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (src[0 + j] - min)*id; - const float x1 = (src[QK4_1/2 + j] - min)*id; - - const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); - - dst_data[i00/QK4_1].qs[j] = xi0; - dst_data[i00/QK4_1].qs[j] |= xi1 << 4; - } + quantize_q4_1(src, dst_data[i00/QK4_1]); } } @@ -4579,38 +4713,7 @@ kernel void kernel_cpy_f32_q5_0( for (int64_t i00 = tpitg.x*QK5_0; i00 < args.ne00; i00 += ntg.x*QK5_0) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK5_0; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / -16; - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK5_0].d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK5_0/2 + j]*id; - - const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f)); - - dst_data[i00/QK5_0].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); - } - thread const uint8_t * qh8 = (thread const uint8_t *)&qh; - for (int j = 0; j < 4; ++j) { - dst_data[i00/QK5_0].qh[j] = qh8[j]; - } + quantize_q5_0(src, dst_data[i00/QK5_0]); } } @@ -4637,51 +4740,10 @@ kernel void kernel_cpy_f32_q5_1( for (int64_t i00 = tpitg.x*QK5_1; i00 < args.ne00; i00 += ntg.x*QK5_1) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float max = src[0]; - float min = src[0]; - - for (int j = 1; j < QK5_1; j++) { - const float v = src[j]; - min = v < min ? v : min; - max = v > max ? 
v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f/d : 0.0f; - - dst_data[i00/QK5_1].d = d; - dst_data[i00/QK5_1].m = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1/2; ++j) { - const float x0 = (src[0 + j] - min)*id; - const float x1 = (src[QK5_1/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - dst_data[i00/QK5_1].qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); - } - thread const uint8_t * qh8 = (thread const uint8_t *)&qh; - for (int j = 0; j < 4; ++j) { - dst_data[i00/QK5_1].qh[j] = qh8[j]; - } + quantize_q5_1(src, dst_data[i00/QK5_1]); } } -static inline int best_index_int8(int n, constant float * val, float x) { - if (x <= val[0]) return 0; - if (x >= val[n-1]) return n-1; - int ml = 0, mu = n-1; - while (mu-ml > 1) { - int mav = (ml+mu)/2; - if (x < val[mav]) mu = mav; else ml = mav; - } - return x - val[mu-1] < val[mu] - x ? mu-1 : mu; -} - kernel void kernel_cpy_f32_iq4_nl( constant ggml_metal_kargs_cpy & args, device const char * src0, @@ -4705,40 +4767,7 @@ kernel void kernel_cpy_f32_iq4_nl( for (int64_t i00 = tpitg.x*QK4_NL; i00 < args.ne00; i00 += ntg.x*QK4_NL) { device const float * src = (device float *)(src0 + i03*args.nb03 + i02*args.nb02 + i01*args.nb01 + i00*args.nb00); - float amax = 0.0f; // absolute max - float max = 0.0f; - - for (int j = 0; j < QK4_NL; j++) { - const float v = src[j]; - if (amax < fabs(v)) { - amax = fabs(v); - max = v; - } - } - - const float d = max / kvalues_iq4nl_f[0]; - const float id = d ? 1.0f/d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - const float x0 = src[0 + j]*id; - const float x1 = src[QK4_NL/2 + j]*id; - - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl_f, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl_f, x1); - - dst_data[i00/QK4_NL].qs[j] = xi0 | (xi1 << 4); - - const float v0 = kvalues_iq4nl_f[xi0]; - const float v1 = kvalues_iq4nl_f[xi1]; - const float w0 = src[0 + j]*src[0 + j]; - const float w1 = src[QK4_NL/2 + j]*src[QK4_NL/2 + j]; - sumqx += w0*v0*src[j] + w1*v1*src[QK4_NL/2 + j]; - sumq2 += w0*v0*v0 + w1*v1*v1; - - } - - dst_data[i00/QK4_NL].d = sumq2 > 0 ? 
sumqx/sumq2 : d; + quantize_iq4_nl(src, dst_data[i00/QK4_NL]); } } @@ -6419,10 +6448,10 @@ kernel void kernel_mul_mv_iq4_xs_f32( template kernel void kernel_get_rows_q( + constant ggml_metal_kargs_get_rows & args, device const void * src0, device const void * src1, device float * dst, - constant ggml_metal_kargs_get_rows & args, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint3 tptg [[threads_per_threadgroup]]) { @@ -6442,10 +6471,10 @@ kernel void kernel_get_rows_q( template kernel void kernel_get_rows_f( + constant ggml_metal_kargs_get_rows & args, device const void * src0, device const void * src1, device float * dst, - constant ggml_metal_kargs_get_rows & args, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint3 tptg [[threads_per_threadgroup]]) { @@ -6463,10 +6492,10 @@ kernel void kernel_get_rows_f( } kernel void kernel_get_rows_i32( + constant ggml_metal_kargs_get_rows & args, device const void * src0, device const void * src1, device int32_t * dst, - constant ggml_metal_kargs_get_rows & args, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint3 tptg [[threads_per_threadgroup]]) { @@ -6483,6 +6512,67 @@ kernel void kernel_get_rows_i32( } } +template +kernel void kernel_set_rows_q32( + constant ggml_metal_kargs_set_rows & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + + const int32_t i12 = i03%args.ne12; + const int32_t i11 = i02%args.ne11; + + const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; + if (i01 >= args.ne01) { + return; + } + + const int32_t i10 = i01; + const int64_t i1 = ((const device int64_t *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; + + device block_q * dst_row = ( device block_q *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); + const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + + for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { + quantize_func(src_row + 32*ind, dst_row[ind]); + } +} + +template +kernel void kernel_set_rows_f( + constant ggml_metal_kargs_set_rows & args, + device const void * src0, + device const void * src1, + device float * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg [[threads_per_threadgroup]]) { + const int32_t i03 = tgpig.z; + const int32_t i02 = tgpig.y; + + const int32_t i12 = i03%args.ne12; + const int32_t i11 = i02%args.ne11; + + const int32_t i01 = tgpig.x*tptg.y + tiitg/tptg.x; + if (i01 >= args.ne01) { + return; + } + + const int32_t i10 = i01; + const int64_t i1 = ((const device int64_t *) ((const device char *) src1 + i10*args.nb10 + i11*args.nb11 + i12*args.nb12))[0]; + + device T * dst_row = ( device T *) (( device char *) dst + i1*args.nb1 + i02*args.nb2 + i03*args.nb3); + const device float * src_row = (const device float *) ((const device char *) src0 + i01*args.nb01 + i02*args.nb02 + i03*args.nb03); + + for (int ind = tiitg%tptg.x; ind < args.nk0; ind += tptg.x) { + dst_row[ind] = (T) src_row[ind]; + } +} #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A #define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B @@ -6906,6 +6996,27 @@ 
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_q_t kernel_get template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_q_t kernel_get_rows_q; +// +// set rows +// + +typedef decltype(kernel_set_rows_f) set_rows_f_t; + +template [[host_name("kernel_set_rows_f32")]] kernel set_rows_f_t kernel_set_rows_f; +template [[host_name("kernel_set_rows_f16")]] kernel set_rows_f_t kernel_set_rows_f; +#if defined(GGML_METAL_USE_BF16) +template [[host_name("kernel_set_rows_bf16")]] kernel set_rows_f_t kernel_set_rows_f; +#endif + +typedef decltype(kernel_set_rows_q32) set_rows_q32_t; + +template [[host_name("kernel_set_rows_q8_0")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_0")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q4_1")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_0")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_q5_1")]] kernel set_rows_q32_t kernel_set_rows_q32; +template [[host_name("kernel_set_rows_iq4_nl")]] kernel set_rows_q32_t kernel_set_rows_q32; + // // matrix-matrix multiplication // diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ee605977f..3d04f80ef 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -933,6 +933,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "TRANSPOSE", "GET_ROWS", "GET_ROWS_BACK", + "SET_ROWS", "DIAG", "DIAG_MASK_INF", "DIAG_MASK_ZERO", @@ -983,7 +984,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1029,6 +1030,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "transpose(x)", "get_rows(x)", "get_rows_back(x)", + "set_rows(x)", "diag(x)", "diag_mask_inf(x)", "diag_mask_zero(x)", @@ -1079,7 +1081,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1348,6 +1350,12 @@ bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) { tensor->nb[2] == ggml_type_size(tensor->type); } +bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor) { + return + tensor->ne[0] == ggml_blck_size(tensor->type) || + tensor->nb[0] == ggml_type_size(tensor->type); +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -3384,6 +3392,35 @@ struct ggml_tensor * ggml_get_rows_back( return result; } +// ggml_set_rows + +struct ggml_tensor * ggml_set_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c) { + GGML_ASSERT(a->ne[0] == b->ne[0]); + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(a->ne[3] == b->ne[3]); + GGML_ASSERT(b->ne[1] == c->ne[0]); + GGML_ASSERT(b->ne[2] % c->ne[1] == 0); + GGML_ASSERT(b->ne[3] % c->ne[2] == 0); + GGML_ASSERT(c->ne[3] == 1); + GGML_ASSERT(b->type == GGML_TYPE_F32); + GGML_ASSERT(c->type == GGML_TYPE_I64); + + GGML_ASSERT(ggml_is_contiguous_rows(a)); + GGML_ASSERT(ggml_is_contiguous_rows(b)); + + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + 
result->op = GGML_OP_SET_ROWS; + result->src[0] = b; + result->src[1] = c; + + return result; +} + // ggml_diag struct ggml_tensor * ggml_diag( diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 615c2dc00..a233f1f2f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1213,6 +1213,76 @@ struct test_get_rows_back : public test_case { } }; +// GGML_OP_SET_ROWS +struct test_set_rows : public test_case { + const ggml_type type; + const std::array ne; + const std::array nr23; // broadcast only dims 2 and 3 + const int r; // rows to set + const bool v; // view (non-contiguous src1) + + std::string vars() override { + return VARS_TO_STR5(type, ne, nr23, r, v); + } + + test_set_rows(ggml_type type, + std::array ne, + std::array nr23, + int r, bool v = false) + : type(type), ne(ne), nr23(nr23), r(r), v(v) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * dst = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], ne[2]*nr23[0], ne[3]*nr23[1]); + ggml_set_name(dst, "dst"); + + ggml_tensor * src = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], r, ne[2]*nr23[0], ne[3]*nr23[1]); + ggml_set_name(src, "src"); + + ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, GGML_TYPE_I64, r, ne[2], ne[3]); + ggml_set_name(row_idxs, "row_idxs"); + + if (v) { + src = ggml_view_4d(ctx, src, ne[0], r/2, ne[2]*nr23[0], ne[3]*nr23[1], src->nb[1], src->nb[2], src->nb[3], 0); + row_idxs = ggml_view_3d(ctx, row_idxs, r/2, ne[2], ne[3], row_idxs->nb[1], row_idxs->nb[2], 0); + ggml_set_name(row_idxs, "view_of_rows"); + } + + ggml_tensor * out = ggml_set_rows(ctx, dst, src, row_idxs); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I64) { + if (ggml_is_view_op(t->op)) { + continue; + } + + for (int i2 = 0; i2 < t->ne[2]; i2++) { + for (int i1 = 0; i1 < t->ne[1]; i1++) { + // generate a shuffled subset of row indices + std::vector data(ne[1]); + for (int i = 0; i < ne[1]; i++) { + data[i] = i; + } + std::shuffle(data.begin(), data.end(), rng); + data.resize(t->ne[0]); + + const size_t offs = i1*t->nb[1] + i2*t->nb[2]; + ggml_backend_tensor_set(t, data.data(), offs, t->ne[0]*sizeof(int64_t)); + } + } + } else { + init_tensor_uniform(t); + } + } + } +}; + // GGML_OP_ARGMAX struct test_argmax : public test_case { const ggml_type type; @@ -3984,6 +4054,23 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_get_rows_back(GGML_TYPE_I32, 256, 5, 4, 1, v)); } + test_cases.emplace_back(new test_set_rows(GGML_TYPE_F32, { 1, 8, 1, 3 }, { 1, 1 }, 2, false)); + for (ggml_type type : all_types) { + for (int b : {1, 7}) { + for (bool v : {false, true}) { + test_cases.emplace_back(new test_set_rows(type, { 256, 5, b, 3 }, { 1, 1, }, 1, v)); + test_cases.emplace_back(new test_set_rows(type, { 256, 11, 1, b }, { 2, 3, }, 7, v)); + + test_cases.emplace_back(new test_set_rows(type, { 3*ggml_blck_size(type), 3, b, 1 }, { 2, 3, }, 2, v)); + + if (ggml_blck_size(type) == 1) { + test_cases.emplace_back(new test_set_rows(type, { 31, 3, b, 1 }, { 2, 3, }, 2, v)); + test_cases.emplace_back(new test_set_rows(type, { 33, 5, 1, b }, { 2, 3, }, 1, v)); + } + } + } + } + for (ggml_type type_input : {GGML_TYPE_F32}) { for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { for (int k0 : {1, 3}) { From 
43678060c1f4cfab2f899466fd615e358677f807 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 27 Jun 2025 17:55:45 +0300 Subject: [PATCH 21/54] recurrent : call balloc split_reset() in init_batch() (#14414) ggml-ci --- src/llama-memory-recurrent.cpp | 37 +++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index 1b1e95d56..e52156bf3 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -363,30 +363,35 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const { } llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) { - std::vector ubatches; + do { + balloc.split_reset(); - while (true) { - llama_ubatch ubatch; + std::vector ubatches; + while (true) { + llama_ubatch ubatch; - if (embd_all) { - // if all tokens are output, split by sequence - ubatch = balloc.split_seq(n_ubatch); - } else { - ubatch = balloc.split_equal(n_ubatch); + if (embd_all) { + // if all tokens are output, split by sequence + ubatch = balloc.split_seq(n_ubatch); + } else { + ubatch = balloc.split_equal(n_ubatch); + } + + if (ubatch.n_tokens == 0) { + break; + } + + ubatches.push_back(std::move(ubatch)); // NOLINT } - if (ubatch.n_tokens == 0) { + if (!prepare(ubatches)) { break; } - ubatches.push_back(std::move(ubatch)); // NOLINT - } + return std::make_unique(this, std::move(ubatches)); + } while (false); - if (!prepare(ubatches)) { - return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); - } - - return std::make_unique(this, std::move(ubatches)); + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); } llama_memory_context_ptr llama_memory_recurrent::init_full() { From 72babea5dea56c8a8e8420ccf731b12a5cf37854 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 27 Jun 2025 21:42:02 +0300 Subject: [PATCH 22/54] graph : make llm_graph_context destructor virtual (#14410) ggml-ci --- src/llama-graph.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-graph.h b/src/llama-graph.h index 4b1ec354d..ee2197e89 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -475,6 +475,7 @@ struct llm_graph_context { std::unique_ptr res; llm_graph_context(const llm_graph_params & params); + virtual ~llm_graph_context() = default; void cb(ggml_tensor * cur, const char * name, int il) const; From ceb1bf5a34d5e66e28b23dcc7a3cd83fe1e27481 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 27 Jun 2025 22:35:30 -0500 Subject: [PATCH 23/54] vulkan: Fix GGML_VULKAN_SHADER_DEBUG_INFO (#14427) This setting needs to be passed through to vulkan-shaders-gen --- ggml/src/ggml-vulkan/CMakeLists.txt | 1 + ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 0bf4cb14f..b97e7bf99 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -99,6 +99,7 @@ if (Vulkan_FOUND) if (GGML_VULKAN_SHADER_DEBUG_INFO) add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO) + list(APPEND VULKAN_SHADER_GEN_CMAKE_ARGS -DGGML_VULKAN_SHADER_DEBUG_INFO=ON) endif() if (GGML_VULKAN_VALIDATE) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 14e9daaa0..e1f613fb4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ 
-19,6 +19,10 @@ if (GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) add_compile_definitions(GGML_VULKAN_BFLOAT16_GLSLC_SUPPORT) message(STATUS "Enabling bfloat16 glslc support") endif() +if (GGML_VULKAN_SHADER_DEBUG_INFO) + add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO) + message(STATUS "Enabling shader debug info") +endif() set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) From 6609507a910aa7437aaa53fd999447de3947d998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sat, 28 Jun 2025 09:57:07 +0200 Subject: [PATCH 24/54] ci : fix windows build and release (#14431) --- .github/workflows/build.yml | 18 +++++++++++++----- .github/workflows/release.yml | 12 ++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4feccf21e..4ea8ea3c0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -664,7 +664,7 @@ jobs: ./build-xcframework.sh windows-msys2: - runs-on: windows-latest + runs-on: windows-2025 strategy: fail-fast: false @@ -714,7 +714,7 @@ jobs: cmake --build build --config ${{ matrix.build }} -j $(nproc) windows-latest-cmake: - runs-on: windows-latest + runs-on: windows-2025 env: OPENBLAS_VERSION: 0.3.23 @@ -725,16 +725,22 @@ jobs: matrix: include: - build: 'cpu-x64 (static)' + arch: 'x64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF' - build: 'openblas-x64' + arch: 'x64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'vulkan-x64' + arch: 'x64' defines: '-DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - build: 'llvm-arm64' + arch: 'arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' - build: 'llvm-arm64-opencl-adreno' + arch: 'arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' # - build: 'kompute-x64' + # arch: 'x64' # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' steps: @@ -805,6 +811,8 @@ jobs: - name: libCURL id: get_libcurl uses: ./.github/actions/windows-setup-curl + with: + architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - name: Build id: cmake_build @@ -825,7 +833,7 @@ jobs: - name: Test id: cmake_test - if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }} + if: ${{ matrix.arch == 'x64' }} run: | cd build ctest -L main -C Release --verbose --timeout 900 @@ -930,7 +938,7 @@ jobs: cmake --build build --config Release windows-latest-cmake-sycl: - runs-on: windows-latest + runs-on: windows-2022 defaults: run: @@ -964,7 +972,7 @@ jobs: windows-latest-cmake-hip: if: ${{ 
github.event.inputs.create_release != 'true' }} - runs-on: windows-latest + runs-on: windows-2022 steps: - name: Clone diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 64fff175e..7c95a61fc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -235,7 +235,7 @@ jobs: name: llama-bin-ubuntu-vulkan-x64.zip windows-cpu: - runs-on: windows-latest + runs-on: windows-2025 strategy: matrix: @@ -271,7 +271,7 @@ jobs: env: CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch }} + call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" ${{ matrix.arch == 'x64' && 'x64' || 'amd64_arm64' }} cmake -S . -B build -G "Ninja Multi-Config" ^ -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ^ -DGGML_NATIVE=OFF ^ @@ -288,7 +288,7 @@ jobs: CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ - Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.42.34433\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\ + Copy-Item "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Redist\MSVC\14.44.35112\debug_nonredist\${{ matrix.arch }}\Microsoft.VC143.OpenMP.LLVM\libomp140.${{ matrix.arch == 'x64' && 'x86_64' || 'aarch64' }}.dll" .\build\bin\Release\ 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\* - name: Upload artifacts @@ -298,7 +298,7 @@ jobs: name: llama-bin-win-cpu-${{ matrix.arch }}.zip windows: - runs-on: windows-latest + runs-on: windows-2025 env: OPENBLAS_VERSION: 0.3.23 @@ -448,7 +448,7 @@ jobs: name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip windows-sycl: - runs-on: windows-latest + runs-on: windows-2022 defaults: run: @@ -520,7 +520,7 @@ jobs: name: llama-bin-win-sycl-x64.zip windows-hip: - runs-on: windows-latest + runs-on: windows-2022 strategy: matrix: From b25e92774e2fa4ee3820e458d5cf43f40190f8d2 Mon Sep 17 00:00:00 2001 From: Xinpeng Dou <15529241576@163.com> Date: Sat, 28 Jun 2025 17:35:41 +0800 Subject: [PATCH 25/54] fix async_mode bug (#14432) --- ggml/src/ggml-cann/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h index ba2cef0c2..8dfe3b061 100755 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -359,7 +359,7 @@ struct ggml_backend_cann_context { ggml_cann_set_device(device); description = aclrtGetSocName(); - bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")); + async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")); GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, device, async_mode ? "ON" : "OFF"); } From 566c16fcce44876a167c37f159085afe6f84b28c Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Sat, 28 Jun 2025 22:08:21 +0800 Subject: [PATCH 26/54] model : add support for ERNIE 4.5 0.3B model (#14408) Add Day-0 support for Baidu ERNIE 4.5 0.3B model. 
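
For reference, the fused-tensor splits performed by the converter can be
sketched as below (illustrative only — the helper names are not part of this
patch; the shapes follow the comments in convert_hf_to_gguf.py):

    import torch

    # illustrative helpers, not part of the patch
    # qkv_proj packs q, k and v along dim 0:
    #   [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size]
    def split_qkv(qkv: torch.Tensor, num_heads: int, num_kv_heads: int, head_dim: int):
        q, k, v = qkv.split([num_heads * head_dim,
                             num_kv_heads * head_dim,
                             num_kv_heads * head_dim], dim=0)
        return q, k, v

    # up_gate_proj packs gate and up along dim 0:
    #   [2 * intermediate_size, hidden_size]
    def split_up_gate(up_gate: torch.Tensor):
        gate, up = up_gate.split(up_gate.shape[0] // 2, dim=0)
        return gate, up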
Signed-off-by: Weizhao Ouyang --- convert_hf_to_gguf.py | 46 ++++++++++ gguf-py/gguf/constants.py | 16 ++++ src/llama-arch.cpp | 18 ++++ src/llama-arch.h | 1 + src/llama-model.cpp | 178 ++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 1 + 6 files changed, 260 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index aed595e25..c2c55166e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2743,6 +2743,52 @@ class Qwen2Model(TextModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Ernie4_5_ForCausalLM") +class Ernie4_5Model(TextModel): + model_arch = gguf.MODEL_ARCH.ERNIE4_5 + + def set_vocab(self): + self._set_vocab_sentencepiece() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + head_dim = self.hparams["head_dim"] + + if "ernie." in name: + name = name.replace("ernie.", "model.") + # split the qkv weights + # qkv_proj shape: [(num_heads + 2 * num_kv_heads) * head_dim, hidden_size] + if "qkv_proj" in name: + name_q = name.replace("qkv_proj.weight", "q_proj.weight") + name_k = name.replace("qkv_proj.weight", "k_proj.weight") + name_v = name.replace("qkv_proj.weight", "v_proj.weight") + total_q_dim = num_heads * head_dim + total_k_dim = num_kv_heads * head_dim + total_v_dim = num_kv_heads * head_dim + q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) + return [ + (self.map_tensor_name(name_q), q_proj_weight), + (self.map_tensor_name(name_k), k_proj_weight), + (self.map_tensor_name(name_v), v_proj_weight) + ] + # split the up_gate_proj into gate and up + # up_gate_proj shape: [2 * intermediate_size, hidden_size] + if "up_gate_proj" in name: + name_up = name.replace("up_gate_proj.weight", "up_proj.weight") + name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") + dim_half = data_torch.shape[0] // 2 + gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) + return [ + (self.map_tensor_name(name_gate), gate_proj_weight), + (self.map_tensor_name(name_up), up_proj_weight) + ] + return [(self.map_tensor_name(name), data_torch)] + + @ModelBase.register( "Qwen2VLModel", "Qwen2VLForConditionalGeneration", diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index fb75143b0..b5ba933cb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -354,6 +354,7 @@ class MODEL_ARCH(IntEnum): BAILINGMOE = auto() DOTS1 = auto() ARCEE = auto() + ERNIE4_5 = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -654,6 +655,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.BAILINGMOE: "bailingmoe", MODEL_ARCH.DOTS1: "dots1", MODEL_ARCH.ARCEE: "arcee", + MODEL_ARCH.ERNIE4_5: "ernie4_5", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -2177,6 +2179,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.ERNIE4_5: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 435e3b9ba..aa21108a4 
100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -76,6 +76,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_DOTS1, "dots1" }, { LLM_ARCH_ARCEE, "arcee" }, + { LLM_ARCH_ERNIE4_5, "ernie4_5" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1658,6 +1659,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, } }, + { + LLM_ARCH_ERNIE4_5, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 9181ad053..0771ec3eb 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -80,6 +80,7 @@ enum llm_arch { LLM_ARCH_BAILINGMOE, LLM_ARCH_DOTS1, LLM_ARCH_ARCEE, + LLM_ARCH_ERNIE4_5, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fc39195ed..b15bf73c2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -47,6 +47,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_475M: return "475M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; + case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; @@ -1504,6 +1505,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_ERNIE4_5: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 18: type = LLM_TYPE_0_3B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -4344,6 +4353,40 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; + case LLM_ARCH_ERNIE4_5: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // optional bias tensors + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } @@ -14125,6 +14168,136 @@ struct llm_build_dots1 : public llm_graph_context { } }; +struct llm_build_ernie4_5 : public llm_graph_context { + llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + { + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + } + + // self-attention + { + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 
n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_arcee : public llm_graph_context { llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -14635,6 +14808,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_ERNIE4_5: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -14786,6 +14963,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_BAILINGMOE: case LLM_ARCH_NEO_BERT: case LLM_ARCH_ARCEE: + case LLM_ARCH_ERNIE4_5: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/src/llama-model.h b/src/llama-model.h index 40063b790..a958c5997 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -39,6 +39,7 @@ enum llm_type { LLM_TYPE_475M, LLM_TYPE_770M, LLM_TYPE_780M, + LLM_TYPE_0_3B, LLM_TYPE_0_5B, LLM_TYPE_0_6B, LLM_TYPE_1B, From 00d5282c7f2a0bb05bb315fb81ca3b0f42cf9f07 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 28 Jun 2025 10:17:09 -0500 Subject: [PATCH 27/54] vulkan: lock accesses of pinned_memory vector (#14333) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 99be5e45b..e42f115d0 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -305,7 +305,7 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& } struct vk_device_struct { - std::mutex mutex; + std::recursive_mutex mutex; 
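+    // recursive, since the pinned_memory helpers (e.g. ggml_vk_host_get) now
+    // take this lock and can be reached from code that already holds it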
vk::PhysicalDevice physical_device; vk::PhysicalDeviceProperties properties; @@ -1197,7 +1197,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin } { - std::lock_guard guard(device->mutex); + std::lock_guard guard(device->mutex); device->pipelines.insert({ pipeline->name, pipeline }); } @@ -1411,7 +1411,7 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector guard(device->mutex); + std::lock_guard guard(device->mutex); q.queue_family_index = queue_family_index; q.transfer_only = transfer_only; @@ -4124,6 +4124,7 @@ static void * ggml_vk_host_malloc(vk_device& device, size_t size) { return nullptr; } + std::lock_guard guard(device->mutex); device->pinned_memory.push_back(std::make_tuple(buf->ptr, size, buf)); return buf->ptr; @@ -4134,6 +4135,8 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { return; } VK_LOG_MEMORY("ggml_vk_host_free(" << ptr << ")"); + std::lock_guard guard(device->mutex); + vk_buffer buf; size_t index; for (size_t i = 0; i < device->pinned_memory.size(); i++) { @@ -4156,6 +4159,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { } static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { + std::lock_guard guard(device->mutex); buf = nullptr; buf_offset = 0; for (size_t i = 0; i < device->pinned_memory.size(); i++) { @@ -4457,7 +4461,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - std::lock_guard guard(dst->device->mutex); + std::lock_guard guard(dst->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); @@ -4548,7 +4552,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_ memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - std::lock_guard guard(src->device->mutex); + std::lock_guard guard(src->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(src->device, subctx); @@ -4578,7 +4582,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { if (src->device == dst->device) { - std::lock_guard guard(src->device->mutex); + std::lock_guard guard(src->device->mutex); VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")"); // Copy within the device vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool); @@ -4613,7 +4617,7 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) { VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")"); - std::lock_guard guard(dst->device->mutex); + std::lock_guard guard(dst->device->mutex); vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool); ggml_vk_ctx_begin(dst->device, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); From 63a7bb3c7e1c6b0a92d03b0a594d3cd501d6ed3e Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 28 Jun 2025 10:36:40 -0500 Subject: [PATCH 28/54] vulkan: handle noncontig in the final case of ggml_vk_get_cpy_pipeline (#14378) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 18 
+++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index e42f115d0..996ccbf66 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -4844,9 +4844,17 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const // type size must be exactly 2 or 4. GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4); if ((ggml_type_size(src->type) % 4) == 0) { - return ctx->device->pipeline_contig_cpy_f32_f32; + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_cpy_f32_f32; + } } else { - return ctx->device->pipeline_contig_cpy_f16_f16; + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f16; + } else { + return ctx->device->pipeline_cpy_f16_f16; + } } } @@ -4907,7 +4915,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT const uint64_t ne00 = src0->ne[0]; @@ -5135,7 +5143,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << (dryrun ? 
"dryrun" : "") << "),)"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT const uint64_t ne00 = src0->ne[0]; @@ -5736,7 +5744,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3]; std::cerr << "), " << (dryrun ? "dryrun" : "") << ")"); - GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); // NOLINT GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT GGML_ASSERT(ids->type == GGML_TYPE_I32); From 27208bf657cfe7262791df473927225e48efe482 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 29 Jun 2025 01:30:53 +0800 Subject: [PATCH 29/54] CUDA: add bf16 and f32 support to cublas_mul_mat_batched (#14361) * CUDA: add bf16 and f32 support to cublas_mul_mat_batched * Review: add type traits and make function more generic * Review: make check more explicit, add back comments, and fix formatting * Review: fix formatting, remove useless type conversion, fix naming for bools --- ggml/src/ggml-cuda/convert.cu | 22 ++++ ggml/src/ggml-cuda/convert.cuh | 5 + ggml/src/ggml-cuda/ggml-cuda.cu | 207 ++++++++++++++++++++------------ tests/test-backend-ops.cpp | 6 +- 4 files changed, 162 insertions(+), 78 deletions(-) diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index c6dec4276..eeaa14bf5 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -728,3 +728,25 @@ to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type) { return nullptr; } } + +to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return convert_unary_cuda; + case GGML_TYPE_F16: + return convert_unary_cuda; + default: + return nullptr; + } +} + +to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_F16: + return convert_unary_cuda; + case GGML_TYPE_BF16: + return convert_unary_cuda; + default: + return nullptr; + } +} diff --git a/ggml/src/ggml-cuda/convert.cuh b/ggml/src/ggml-cuda/convert.cuh index b65b98e08..f04214be1 100644 --- a/ggml/src/ggml-cuda/convert.cuh +++ b/ggml/src/ggml-cuda/convert.cuh @@ -22,5 +22,10 @@ using to_t_nc_cuda_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, int64_t s01, int64_t s02, int64_t s03, cudaStream_t stream); +typedef to_t_nc_cuda_t to_fp32_nc_cuda_t; typedef to_t_nc_cuda_t to_fp16_nc_cuda_t; +typedef to_t_nc_cuda_t 
to_bf16_nc_cuda_t; + +to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type); to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type); +to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b30c13c62..811422f38 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1749,7 +1749,7 @@ static void ggml_cuda_op_mul_mat( } static __global__ void k_compute_batched_ptrs( - const half * src0_as_f16, const half * src1_as_f16, char * dst, + const void * src0_as_f16, const void * src1_as_f16, char * dst, const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23, @@ -1772,83 +1772,131 @@ static __global__ void k_compute_batched_ptrs( ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +// Type traits for mapping ggml types to CUDA/cuBLAS types +template +struct batched_mul_mat_traits; + +template<> +struct batched_mul_mat_traits { + using cuda_type = float; + static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + static inline const cudaDataType_t data_type = CUDA_R_32F; + static inline const ggml_type ggml_type_val = GGML_TYPE_F32; + static inline const float alpha = 1.0f; + static inline const float beta = 0.0f; + static inline const void* get_alpha() { static const float val = alpha; return &val; } + static inline const void* get_beta() { static const float val = beta; return &val; } + static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp32_nc_cuda(src_type); } +}; + +template<> +struct batched_mul_mat_traits { + using cuda_type = nv_bfloat16; + static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + static inline const cudaDataType_t data_type = CUDA_R_16BF; + static inline const ggml_type ggml_type_val = GGML_TYPE_BF16; + static inline const float alpha = 1.0f; + static inline const float beta = 0.0f; + static inline const void* get_alpha() { static const float val = alpha; return &val; } + static inline const void* get_beta() { static const float val = beta; return &val; } + static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_bf16_nc_cuda(src_type); } +}; + +template<> +struct batched_mul_mat_traits { + using cuda_type = half; + static inline const cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + static inline const cudaDataType_t data_type = CUDA_R_16F; + static inline const ggml_type ggml_type_val = GGML_TYPE_F16; + static inline const half alpha = 1.0; + static inline const half beta = 0.0; + static inline const void* get_alpha() { static const half val = alpha; return &val; } + static inline const void* get_beta() { static const half val = beta; return &val; } + static inline auto get_nc_converter(ggml_type src_type) { return ggml_get_to_fp16_nc_cuda(src_type); } +}; + +template +static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + using traits = batched_mul_mat_traits; + using cuda_t = typename traits::cuda_type; + GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft)); - GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == src0_type); + 
GGML_ASSERT(ggml_is_contiguous(dst)); // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst. // As long as dst is contiguous this does not matter though. - GGML_ASSERT(ggml_is_contiguous(dst)); GGML_TENSOR_BINARY_OP_LOCALS const int64_t ne_dst = ggml_nelements(dst); - cudaStream_t main_stream = ctx.stream(); - CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - const half * src0_f16 = (const half *) src0->data; float * dst_ddf = (float *) dst->data; - - const half * src1_f16 = (const half *) src1->data; const size_t ts_src1 = ggml_type_size(src1->type); GGML_ASSERT(nb10 == ts_src1); int64_t s11 = nb11 / ts_src1; int64_t s12 = nb12 / ts_src1; int64_t s13 = nb13 / ts_src1; - ggml_cuda_pool_alloc src1_f16_alloc(ctx.pool()); - // convert src1 to fp16 - if (src1->type != GGML_TYPE_F16) { - const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type); + const cuda_t * src0_ptr = nullptr; + const cuda_t * src1_ptr = nullptr; + + ggml_cuda_pool_alloc src0_alloc(ctx.pool()); + ggml_cuda_pool_alloc src1_alloc(ctx.pool()); + + // Handle src0 + src0_ptr = (const cuda_t *) src0->data; + + // Handle src1 - convert if necessary + if (src1->type == src0_type) { + src1_ptr = (const cuda_t *) src1->data; + } else { + // Convert src1 to target type using traits conversion functions const int64_t ne_src1 = ggml_nelements(src1); - src1_f16_alloc.alloc(ne_src1); - GGML_ASSERT(to_fp16_cuda != nullptr); + src1_alloc.alloc(ne_src1); - to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); - - src1_f16 = src1_f16_alloc.get(); + const auto convert_func = traits::get_nc_converter(src1->type); + GGML_ASSERT(convert_func != nullptr); + convert_func(src1->data, src1_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream); + src1_ptr = src1_alloc.get(); s11 = ne10; s12 = ne11*s11; s13 = ne12*s12; } - ggml_cuda_pool_alloc dst_f16(ctx.pool()); + // Setup destination buffer + ggml_cuda_pool_alloc dst_temp(ctx.pool()); char * dst_t; - - cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; - cudaDataType_t cu_data_type = CUDA_R_16F; - - // dst strides size_t nbd2 = dst->nb[2]; size_t nbd3 = dst->nb[3]; - const half alpha_f16 = 1.0f; - const half beta_f16 = 0.0f; - + cublasComputeType_t cu_compute_type = traits::compute_type; + cudaDataType_t cu_data_type = traits::data_type; + cudaDataType_t cu_data_type_a = traits::data_type; + cudaDataType_t cu_data_type_b = traits::data_type; + const void * alpha = traits::get_alpha(); + const void * beta = traits::get_beta(); const float alpha_f32 = 1.0f; - const float beta_f32 = 0.0f; - - const void * alpha = &alpha_f16; - const void * beta = &beta_f16; + const float beta_f32 = 0.0f; if (dst->op_params[0] == GGML_PREC_DEFAULT) { - dst_t = (char *) dst_f16.alloc(ne_dst); - - nbd2 /= sizeof(float) / sizeof(half); - nbd3 /= sizeof(float) / sizeof(half); + if constexpr (src0_type == GGML_TYPE_F32) { + dst_t = (char *) dst_ddf; // Direct F32 output + } else { + dst_t = (char *) dst_temp.alloc(ne_dst); + nbd2 /= sizeof(float) / sizeof(cuda_t); + nbd3 /= sizeof(float) / sizeof(cuda_t); + } } else { dst_t = (char *) dst_ddf; - cu_compute_type = CUBLAS_COMPUTE_32F; - cu_data_type = CUDA_R_32F; - + cu_data_type = CUDA_R_32F; alpha = &alpha_f32; - beta = &beta_f32; + beta = &beta_f32; } int id = ggml_cuda_get_device(); @@ -1856,7 +1904,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { 
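        // CDNA and RDNA4 force FP32 accumulation regardless of the storage
        // type selected above, so the scaling factors must be FP32 as well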
cu_compute_type = CUBLAS_COMPUTE_32F; alpha = &alpha_f32; - beta = &beta_f32; + beta = &beta_f32; } GGML_ASSERT(ne12 % ne02 == 0); @@ -1866,35 +1914,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co const int64_t r2 = ne12/ne02; const int64_t r3 = ne13/ne03; -#if 0 - // use cublasGemmEx - { - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - int i03 = i13 / r3; - int i02 = i12 / r2; - - CUBLAS_CHECK( - cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half), - src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11, - beta, ( char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0, - cu_compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - } - } -#else if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 // use cublasGemmStridedBatchedEx CUBLAS_CHECK( cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA - src1_f16, CUDA_R_16F, s11, s12, // strideB - beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC + alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA + src1_ptr, cu_data_type_b, s11, s12, // strideB + beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC ne12*ne13, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); @@ -1905,34 +1933,55 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co ggml_cuda_pool_alloc ptrs_src(ctx.pool(), 2*ne23); ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); + size_t src1_stride_size = sizeof(cuda_t); + dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( - src0_f16, src1_f16, dst_t, + src0_ptr, src1_ptr, dst_t, ptrs_src.get(), ptrs_dst.get(), ne12, ne13, ne23, nb02, nb03, - src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half), - src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half), + (src1->type == src0_type) ? nb12 : s12*src1_stride_size, + (src1->type == src0_type) ? 
nb13 : s13*src1_stride_size, nbd2, nbd3, r2, r3); + CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, - alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, - (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11, - beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, + alpha, (const void **) (ptrs_src.get() + 0*ne23), cu_data_type_a, nb01/nb00, + (const void **) (ptrs_src.get() + 1*ne23), cu_data_type_b, s11, + beta, ( void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0, ne23, cu_compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } -#endif - if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type == CUDA_R_16F) { - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16.get(), dst_ddf, ne_dst, main_stream); + // Convert output back to F32 if needed + if (dst->op_params[0] == GGML_PREC_DEFAULT && cu_data_type != CUDA_R_32F) { + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(traits::ggml_type_val); + to_fp32_cuda(dst_temp.get(), dst_ddf, ne_dst, main_stream); + } +} + +static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F32); + + switch (src0->type) { + case GGML_TYPE_F32: + ggml_cuda_mul_mat_batched_cublas_impl(ctx, src0, src1, dst); + break; + case GGML_TYPE_BF16: + ggml_cuda_mul_mat_batched_cublas_impl(ctx, src0, src1, dst); + break; + case GGML_TYPE_F16: + ggml_cuda_mul_mat_batched_cublas_impl(ctx, src0, src1, dst); + break; + default: + GGML_ABORT("Unsupported type"); } } @@ -1984,6 +2033,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + //TODO update for generic tensor parallelism + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + bool use_batched_cublas_f16 = src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16); + bool use_batched_cublas_bf16 = src0->type == GGML_TYPE_BF16 && bf16_mma_hardware_available(cc); + bool use_batched_cublas_f32 = src0->type == GGML_TYPE_F32; + if (!split && use_mul_mat_vec) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) @@ -1992,8 +2047,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); } else if (!split && use_mul_mat_q) { ggml_cuda_mul_mat_q(ctx, src0, src1, nullptr, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && - !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32) + && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention 
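        // dispatch is per src0->type: F32 always qualifies, BF16 only when
        // bf16_mma_hardware_available(cc) reports support, and F16 when src1
        // is F16 or no slow-FP16 GPU is involved (the three bools above)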
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index a233f1f2f..128d63988 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4425,8 +4425,10 @@ static std::vector> make_test_cases_eval() { for (auto nr : {1,4}) { for (uint32_t m = 0; m < 2; ++m) { for (uint32_t k = 0; k < 2; ++k) { - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true)); + for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, 1}, {nr, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, 1}, {nr, 1}, {0, 1, 2, 3}, true)); + } } } } From bd9c981d7226107f18deb8344c3301450311bb8b Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 29 Jun 2025 02:43:36 -0500 Subject: [PATCH 30/54] vulkan: Add fusion support for RMS_NORM+MUL (#14366) * vulkan: Add fusion support for RMS_NORM+MUL - Add a use_count to ggml_tensor, so we can detect if an output is used more than once. - Change the ggml-vulkan rms_norm shader to optionally multiply by another tensor. - Add detection logic and basic fusion logic in ggml-vulkan. - Add some testing support for fusion. Rather than computing one node at a time, allow for computing the whole graph and just testing one node's results. Add rms_norm_mul tests and enable a llama test. * extract some common fusion logic * fix -Winconsistent-missing-override * move ggml_can_fuse to a common function * build fix * C and C++ versions of can_fuse * move use count to the graph to avoid data races and double increments when used in multiple threads * use hash table lookup to find node index * change use_counts to be indexed by hash table slot * minimize hash lookups style fixes * last node doesn't need single use. fix type. handle mul operands being swapped. 
* remove redundant parameter --------- Co-authored-by: slaren --- ggml/include/ggml-backend.h | 2 +- ggml/src/ggml-backend.cpp | 58 ++++++++++----- ggml/src/ggml-impl.h | 64 ++++++++++++++++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 58 +++++++++++---- .../ggml-vulkan/vulkan-shaders/rms_norm.comp | 15 +++- .../vulkan-shaders/vulkan-shaders-gen.cpp | 2 +- ggml/src/ggml.c | 46 ++++++++---- tests/test-backend-ops.cpp | 74 ++++++++++++++++++- 8 files changed, 263 insertions(+), 56 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 778927f68..a2977ea2e 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -339,7 +339,7 @@ extern "C" { typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends - GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node); // Tensor initialization GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index b1050ad59..788861a36 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -817,8 +817,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } if (sched->debug > 1) { ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name, + fmt_size(ggml_nbytes(node)), tensor_backend ? 
ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node), + graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1826,7 +1827,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor * test_node) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); if (copy.buffer == NULL) { return false; @@ -1837,28 +1838,45 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t assert(g1->n_nodes == g2->n_nodes); - for (int i = 0; i < g1->n_nodes; i++) { - struct ggml_tensor * t1 = g1->nodes[i]; - struct ggml_tensor * t2 = g2->nodes[i]; + if (test_node != nullptr) { + // Compute the whole graph and only test the output for a specific tensor + ggml_backend_graph_compute(backend1, g1); + ggml_backend_graph_compute(backend2, g2); - assert(t1->op == t2->op && ggml_are_same_layout(t1, t2)); - - struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); - struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); - - ggml_backend_graph_compute(backend1, &g1v); - ggml_backend_graph_compute(backend2, &g2v); - - if (ggml_is_view_op(t1->op)) { - continue; + int test_node_idx = -1; + for (int i = 0; i < g1->n_nodes; i++) { + struct ggml_tensor * t1 = g1->nodes[i]; + if (t1 == test_node) { + test_node_idx = i; + break; + } } + GGML_ASSERT(test_node_idx != -1); - // compare results, calculate rms etc - if (!callback(i, t1, t2, user_data)) { - break; + callback(test_node_idx, g1->nodes[test_node_idx], g2->nodes[test_node_idx], user_data); + } else { + for (int i = 0; i < g1->n_nodes; i++) { + struct ggml_tensor * t1 = g1->nodes[i]; + struct ggml_tensor * t2 = g2->nodes[i]; + + assert(t1->op == t2->op && ggml_are_same_layout(t1, t2)); + + struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1); + struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1); + + ggml_backend_graph_compute(backend1, &g1v); + ggml_backend_graph_compute(backend2, &g2v); + + if (ggml_is_view_op(t1->op)) { + continue; + } + + // compare results, calculate rms etc + if (!callback(i, t1, t2, user_data)) { + break; + } } } - ggml_backend_graph_copy_free(copy); return true; diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 57761644f..4972558c9 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -301,6 +301,7 @@ struct ggml_cgraph { struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes struct ggml_tensor ** grad_accs; // accumulators for node gradients struct ggml_tensor ** leafs; // tensors with constant data + int32_t * use_counts;// number of uses of each tensor, indexed by hash table slot struct ggml_hash_set visited_hash_set; @@ -467,13 +468,76 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) +// return true if the node's results are only used by N other nodes +// and can be fused into their calculations. 
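+// For example, ggml_can_fuse() below requires ggml_node_has_n_uses(cgraph,
+// node_idx + i, 1) for every node of a candidate chain except the last, i.e.
+// each interior result must be consumed only by the next node in the chain.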
+static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) { + const struct ggml_tensor * node = cgraph->nodes[node_idx]; + + // check the use count against how many we're replacing + size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node); + if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) { + return false; + } + + // if node is a view, some other node might be using the intermediate result + // via the view source. + if (node->view_src) { + return false; + } + + // If the user requested output for the node, can't fuse + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { + return false; + } + + return true; +} + +// Returns true if nodes [i, i+ops.size()) are the sequence of ggml_ops in ops[] +// and are fusable. Nodes are considered fusable according to this function if: +// - all nodes except the last have only one use and are not views/outputs (see ggml_node_has_N_uses). +// - all nodes except the last are a src of the following node. +// - all nodes are the same shape. +// TODO: Consider allowing GGML_OP_NONE nodes in between +static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, const enum ggml_op * ops, int num_ops) { + if (node_idx + num_ops > cgraph->n_nodes) { + return false; + } + + for (int i = 0; i < num_ops; ++i) { + struct ggml_tensor * node = cgraph->nodes[node_idx + i]; + if (node->op != ops[i]) { + return false; + } + if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idx + i, 1)) { + return false; + } + if (i > 0) { + struct ggml_tensor * prev = cgraph->nodes[node_idx + i - 1]; + if (node->src[0] != prev && node->src[1] != prev) { + return false; + } + if (!ggml_are_same_shape(node, prev)) { + return false; + } + } + } + return true; +} + #ifdef __cplusplus } #endif #ifdef __cplusplus +#include #include +// nicer C++ syntax for ggml_can_fuse +inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::initializer_list ops) { + return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size()); +} + // expose GGUF internals for test code GGML_API size_t gguf_type_size(enum gguf_type type); GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 996ccbf66..aebcc0391 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -425,6 +425,7 @@ struct vk_device_struct { vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_rms_norm_mul_f32; vk_pipeline pipeline_rms_norm_back_f32; vk_pipeline pipeline_l2_norm_f32; @@ -978,6 +979,10 @@ struct ggml_backend_vk_context { vk_command_pool compute_cmd_pool; vk_command_pool transfer_cmd_pool; + + // number of additional consecutive nodes that are being fused with the + // node currently being processed + uint32_t num_additional_fused_ops {}; }; static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT @@ -2655,7 +2660,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - 
ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -6430,7 +6436,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_rms_norm_f32; + return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; } return nullptr; case GGML_OP_RMS_NORM_BACK: @@ -7530,18 +7536,19 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)ggml_nelements(src0), - (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, - (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) 
dst->nb[3] / dst_type_size, 0, - op_params[0], 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + op_params[0], 0.0f, 0, }, dryrun); } @@ -8736,7 +8743,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* t // Returns true if node has enqueued work into the queue, false otherwise // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution. -static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){ +static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool almost_ready, bool submit){ + ggml_tensor * node = cgraph->nodes[node_idx]; if (ggml_is_empty(node) || !node->buffer) { return false; } @@ -8974,8 +8982,14 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; case GGML_OP_RMS_NORM: - ggml_vk_rms_norm(ctx, compute_ctx, src0, node, dryrun); - + if (ctx->num_additional_fused_ops > 0) { + // fused rms_norm + mul + ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0]; + ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, dryrun); + } else { + ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, dryrun); + } break; case GGML_OP_RMS_NORM_BACK: ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun); @@ -9710,10 +9724,15 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint64_t total_mat_mul_bytes = 0; for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false); + if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } + ggml_vk_build_graph(ctx, cgraph, i, nullptr, 0, true, false, false, false); if (cgraph->nodes[i]->op == GGML_OP_MUL_MAT || cgraph->nodes[i]->op == GGML_OP_MUL_MAT_ID) { total_mat_mul_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } + i += ctx->num_additional_fused_ops; + ctx->num_additional_fused_ops = 0; } if (ctx->device->need_compiles) { ggml_vk_load_shaders(ctx->device); @@ -9775,14 +9794,18 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg mul_mat_bytes += ggml_nbytes(cgraph->nodes[i]->src[0]); } + if (ggml_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; + } + // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; bool submit = (submitted_nodes >= nodes_per_submit) || (mul_mat_bytes >= mul_mat_bytes_per_submit) || - (i == last_node) || + (i + ctx->num_additional_fused_ops == last_node) || (almost_ready && !ctx->almost_ready_fence_pending); - bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit); + bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i + ctx->num_additional_fused_ops == last_node, almost_ready, submit); if (vk_perf_logger_enabled) { if (ctx->compute_ctx.expired()) { @@ -9792,7 +9815,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } else { compute_ctx = 
ctx->compute_ctx.lock(); } - compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1); + // If there are fused ops, just write out timestamps for all nodes to keep the accounting simple + for (int j = 0; j < ctx->num_additional_fused_ops + 1; ++j) { + compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+j+1); + } } if (enqueued) { @@ -9814,6 +9840,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg } submit_count++; } + i += ctx->num_additional_fused_ops; + ctx->num_additional_fused_ops = 0; } if (vk_perf_logger_enabled) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp index deb8ee996..6428ca7ba 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp @@ -1,11 +1,13 @@ #version 450 -#include "generic_unary_head.comp" +#include "generic_binary_head.comp" #include "types.comp" #extension GL_EXT_control_flow_attributes : enable #define BLOCK_SIZE 512 +layout (constant_id = 1) const bool do_multiply = false; + layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; shared FLOAT_TYPE sum[BLOCK_SIZE]; @@ -25,6 +27,7 @@ void main() { const uint stride_sample = p.nb03; uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); + uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp @@ -46,7 +49,13 @@ void main() { const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols); const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { - data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); + if (do_multiply) { + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); + } + } else { + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); + } } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index c63345ec8..a207b98c6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -497,7 +497,7 @@ void process_shaders() { // Norms string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); - string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 3d04f80ef..1262236c0 100644 --- a/ggml/src/ggml.c +++ 
b/ggml/src/ggml.c @@ -5841,19 +5841,32 @@ static void ggml_compute_backward( GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2])); } -static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { +static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { // check if already visited - if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) { - return; + size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node); + GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL); + if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) { + // This is the first time we see this node in the current graph. + cgraph->visited_hash_set.keys[node_hash_pos] = node; + ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos); + cgraph->use_counts[node_hash_pos] = 0; + } else { + // already visited + return node_hash_pos; } for (int i = 0; i < GGML_MAX_SRC; ++i) { const int k = (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) : - /* unknown order, just fall back to using i*/ i; - if (node->src[k]) { - ggml_visit_parents(cgraph, node->src[k]); + /* unknown order, just fall back to using i */ i; + + struct ggml_tensor * src = node->src[k]; + if (src) { + size_t src_hash_pos = ggml_visit_parents(cgraph, src); + + // Update the use count for this operand. + cgraph->use_counts[src_hash_pos]++; } } @@ -5877,6 +5890,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * cgraph->nodes[cgraph->n_nodes] = node; cgraph->n_nodes++; } + + return node_hash_pos; } static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { @@ -6014,6 +6029,7 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) { incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1); incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs + incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); // use_counts incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys if (grads) { incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads @@ -6043,11 +6059,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz void * p = cgraph + 1; - struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); - struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); - struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); - struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; - struct ggml_tensor ** grad_accs_ptr = grads ? 
incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + int32_t * use_counts_ptr = incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); + struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); + struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; + struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL; ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t)); @@ -6062,6 +6079,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.grads =*/ grads_ptr, /*.grad_accs =*/ grad_accs_ptr, /*.leafs =*/ leafs_ptr, + /*.use_counts =*/ use_counts_ptr, /*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, }; @@ -6088,7 +6106,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.grads =*/ NULL, // gradients would need visited_hash_set /*.grad_accs =*/ NULL, /*.leafs =*/ NULL, - /*.visited_hash_set =*/ { 0, NULL, NULL }, + /*.use_counts =*/ cgraph0->use_counts, + /*.visited_hash_set =*/ cgraph0->visited_hash_set, /*.order =*/ cgraph0->order, }; @@ -6115,7 +6134,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) { for (size_t i = 0; i < src->visited_hash_set.size; ++i) { // copy all hashset keys (tensors) that are in use if (ggml_bitset_get(src->visited_hash_set.used, i)) { - ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); + size_t new_hash_pos = ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]); + dst->use_counts[new_hash_pos] = src->use_counts[i]; } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 128d63988..ec088bae2 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -382,6 +382,8 @@ struct test_case { return 0; } + virtual bool run_whole_graph() { return false; } + ggml_cgraph * gf = nullptr; ggml_cgraph * gb = nullptr; @@ -574,7 +576,7 @@ struct test_case { GGML_UNUSED(index); }; - const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); + const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud, run_whole_graph() ? 
out : nullptr); if (!cmp_ok) { printf("compare failed "); @@ -1896,6 +1898,63 @@ struct test_rms_norm_back : public test_case { } }; +// GGML_OP_RMS_NORM + GGML_OP_MUL +struct test_rms_norm_mul : public test_case { + const ggml_type type; + const std::array ne; + const float eps; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "RMS_NORM_MUL"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR3(type, ne, eps); + } + + test_rms_norm_mul(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f) + : type(type), ne(ne), eps(eps) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_param(a); + ggml_set_name(a, "a"); + ggml_set_param(b); + ggml_set_name(b, "b"); + + // Use a and b early, so we don't end up with an OP_NONE between rms_norm and mul + a = ggml_add(ctx, a, b); + ggml_tensor * out = ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } + + double max_nmse_err() override { + return 1e-6; + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } +}; + // GGML_OP_SSM_CONV struct test_ssm_conv : public test_case { const ggml_type type; @@ -3736,6 +3795,7 @@ struct test_llama : public test_llm { static constexpr float attn_factor = 1.0f; static constexpr float beta_fast = 32.0f; static constexpr float beta_slow = 1.0f; + bool fused; std::string op_desc(ggml_tensor * t) override { GGML_UNUSED(t); @@ -3751,7 +3811,9 @@ struct test_llama : public test_llm { return 2e-3; } - test_llama(int n_tokens = 1) + bool run_whole_graph() override { return fused; } + + test_llama(int n_tokens = 1, bool fused = false) : test_llm({ /*n_vocab =*/ 32000, /*n_embd =*/ 3200, @@ -3763,7 +3825,9 @@ struct test_llama : public test_llm { /*f_norm_eps =*/ 0.f, /*f_norm_rms_eps =*/ 1e-5f, /*n_tokens =*/ n_tokens, - }) { + }) + , fused(fused) + { } ggml_tensor * build_graph(ggml_context * ctx) override { @@ -4306,6 +4370,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps)); } + for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f}) { + test_cases.emplace_back(new test_rms_norm_mul(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); + } test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f)); @@ -4677,6 +4744,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3})); + test_cases.emplace_back(new test_llama(2, true)); // these tests are disabled to save execution time, but they can be handy for debugging #if 0 test_cases.emplace_back(new test_llama(1)); From a0535ffa0d35fccfec3e1a0a3bfc9dbb6054d7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 29 Jun 2025 11:04:10 +0200 Subject: [PATCH 31/54] ggml : implement REGLU/GEGLU/SWIGLU ops (#14158) * implement unary REGLU/GEGLU/SWIGLU cpu ops * relax constraints * duplicate shape of source * fix ggml_vec_geglu_f16 * special case gated ops * implement unary 
REGLU/GEGLU/SWIGLU cuda ops * tighten constraints again * refactor into GGML_GLU_OP * metal : add glu kernels ggml-ci * add CUDA_GLU_BLOCK_SIZE [no ci] * more constraints and use 64bit ints ggml-ci * 64bit multiplication [no ci] * implement swapped variants (cpu/cuda) * update comment [no ci] ggml-ci * Vulkan: Add GLU ops and shaders * SYCL: Implement fused kernel GEGLU, SWIGLU and REGLU for single up+gate * ggml : implement GLU for split up/gate (#14181) * implement GLU for split up/gate * add tests for ggml_glu_split * Vulkan: Implement glu_split logic and shader support * add split to logging [no ci] * SYCL: refactor element_size ops and add split up and gate support to gated kernels * SYCL: switch GEGLU to use tanh approximation --------- Co-authored-by: 0cc4m Co-authored-by: Akarshan * GGML: increase OP count in assertion * Refactor: Optimize SYCL element-wise operations with unary function inlining This commit refactors the SYCL element-wise operations to improve performance by: - Inlining unary operations (sgn, abs, elu, gelu, silu, etc.) to reduce kernel launch overhead. - Introducing helper functions `op_xxx` for each unary operation to encapsulate the logic. - Replacing direct kernel calls with calls to these inlined functions. - Using `__dpct_inline__` to encourage compiler inlining. - Minor code cleanup and consistency improvements. The changes aim to reduce kernel launch overhead and improve the overall efficiency of element-wise operations on SYCL devices. * vulkan: Increase workgroup size for GLU, for performance (#14345) * vulkan: Increase workgroup size for GLU, for performance * vulkan: change GLU shaders to do one element per invocation rather than one row per workgroup * merge fix * metal : add support for split and swap ggml-ci --------- Co-authored-by: Georgi Gerganov Co-authored-by: 0cc4m Co-authored-by: Akarshan Co-authored-by: Jeff Bolz --- ggml/include/ggml.h | 69 + ggml/src/ggml-cpu/ggml-cpu.c | 16 + ggml/src/ggml-cpu/ops.cpp | 457 +++++ ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml-cpu/vec.cpp | 24 + ggml/src/ggml-cpu/vec.h | 54 + ggml/src/ggml-cuda/ggml-cuda.cu | 25 + ggml/src/ggml-cuda/unary.cu | 89 + ggml/src/ggml-cuda/unary.cuh | 7 + ggml/src/ggml-metal/ggml-metal-impl.h | 11 + ggml/src/ggml-metal/ggml-metal.m | 71 + ggml/src/ggml-metal/ggml-metal.metal | 64 + ggml/src/ggml-sycl/element_wise.cpp | 1825 +++++++---------- ggml/src/ggml-sycl/element_wise.hpp | 25 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 25 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 117 +- .../src/ggml-vulkan/vulkan-shaders/geglu.comp | 13 + .../ggml-vulkan/vulkan-shaders/glu_head.comp | 15 + .../ggml-vulkan/vulkan-shaders/glu_main.comp | 29 + .../src/ggml-vulkan/vulkan-shaders/reglu.comp | 9 + .../ggml-vulkan/vulkan-shaders/swiglu.comp | 9 + .../vulkan-shaders/vulkan-shaders-gen.cpp | 7 + ggml/src/ggml.c | 138 +- src/llama-graph.cpp | 62 +- src/llama-graph.h | 1 + tests/test-backend-ops.cpp | 116 ++ 26 files changed, 2126 insertions(+), 1153 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2b1bd6e0f..e5dda969a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -520,6 +520,8 @@ extern "C" { GGML_OP_CROSS_ENTROPY_LOSS_BACK, 
GGML_OP_OPT_STEP_ADAMW, + GGML_OP_GLU, + GGML_OP_COUNT, }; @@ -543,6 +545,14 @@ extern "C" { GGML_UNARY_OP_COUNT, }; + enum ggml_glu_op { + GGML_GLU_OP_REGLU, + GGML_GLU_OP_GEGLU, + GGML_GLU_OP_SWIGLU, + + GGML_GLU_OP_COUNT, + }; + enum ggml_object_type { GGML_OBJECT_TYPE_TENSOR, GGML_OBJECT_TYPE_GRAPH, @@ -658,6 +668,7 @@ extern "C" { GGML_API const char * ggml_op_symbol(enum ggml_op op); GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op); GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); @@ -762,6 +773,7 @@ extern "C" { GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); @@ -1090,6 +1102,63 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // gated linear unit ops + // A: n columns, r rows, + // result is n / 2 columns, r rows, + // expects gate in second half of row, unless swapped is true + GGML_API struct ggml_tensor * ggml_glu( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_glu_op op, + bool swapped); + + GGML_API struct ggml_tensor * ggml_reglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_reglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_swiglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_swiglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // A: n columns, r rows, + // B: n columns, r rows, + GGML_API struct ggml_tensor * ggml_glu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_glu_op op); + + GGML_API struct ggml_tensor * ggml_reglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_geglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_swiglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // normalize along rows GGML_API struct ggml_tensor * ggml_norm( struct ggml_context * ctx, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2042ee71f..1d68cde71 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1949,6 +1949,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_unary(params, tensor); } break; + case GGML_OP_GLU: + { + ggml_compute_forward_glu(params, tensor); + } break; case GGML_OP_GET_REL_POS: { ggml_compute_forward_get_rel_pos(params, tensor); @@ -2159,6 +2163,18 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { GGML_ABORT("fatal error"); } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + 
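// Usage sketch for the API declared above (ctx/a/up/gate are illustrative
// names, not from this patch):
//   ggml_tensor * fused = ggml_swiglu(ctx, a);              // a: [2*n_ff, nr] -> [n_ff, nr]
//   ggml_tensor * split = ggml_swiglu_split(ctx, up, gate); // up, gate: [n_ff, nr] each
// The *_swapped variants read the gate from the first half of each row
// instead of the second.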
case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + { + n_tasks = n_threads; + } break; + default: + GGML_ABORT("fatal error"); + } + break; case GGML_OP_SILU_BACK: case GGML_OP_MUL: case GGML_OP_DIV: diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 9f17ea43c..27586ed1f 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -3184,6 +3184,435 @@ void ggml_compute_forward_silu_back( } } +// ggml_compute_forward_reglu + +static void ggml_compute_forward_reglu_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_reglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_reglu_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 
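// Worked example of the row partitioning above: nr = 10 rows over nth = 4
// threads gives dr = (10 + 4 - 1)/4 = 3, so thread 0 takes rows [0,3),
// thread 1 takes [3,6), thread 2 takes [6,9), and thread 3 takes [9,10)
// after the MIN clamp; every row is covered exactly once and no thread
// reads past nr.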
0 : nc; + } + + ggml_vec_reglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_reglu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_reglu_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_reglu_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_geglu + +static void ggml_compute_forward_geglu_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_geglu_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? 
src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_geglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_geglu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_geglu_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_geglu_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + +// ggml_compute_forward_swiglu + +static void ggml_compute_forward_swiglu_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + float * src0_p = (float *) (src0_d + i1*src0_o); + float * src1_p = (float *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_swiglu_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_swiglu_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + char * src0_d = (char *) src0->data; + char * src1_d = (char *) (src1 ? src1->data : src0->data); + const size_t src0_o = src0->nb[1]; + const size_t src1_o = src1 ? 
src1->nb[1] : src0->nb[1]; + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(ggml_is_contiguous_1(dst)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src0->type == src1->type); + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + const int nr = ggml_nrows(src0); + + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == nr); + + const int32_t swapped = ggml_get_op_params_i32(dst, 1); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_fp16_t * src0_p = (ggml_fp16_t *) (src0_d + i1*src0_o); + ggml_fp16_t * src1_p = (ggml_fp16_t *) (src1_d + i1*src1_o); + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + ggml_vec_swiglu_f16(nc, (ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])), src0_p, src1_p); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_swiglu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_swiglu_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_swiglu_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_norm static void ggml_compute_forward_norm_f32( @@ -8052,6 +8481,34 @@ void ggml_compute_forward_unary( } } +//ggml_compute_forward_glu + +void ggml_compute_forward_glu( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_glu_op op = ggml_get_glu_op(dst); + + switch (op) { + case GGML_GLU_OP_REGLU: + { + ggml_compute_forward_reglu(params, dst); + } break; + case GGML_GLU_OP_GEGLU: + { + ggml_compute_forward_geglu(params, dst); + } break; + case GGML_GLU_OP_SWIGLU: + { + ggml_compute_forward_swiglu(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_get_rel_pos static void ggml_compute_forward_get_rel_pos_f16( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 3a395fdcd..5b384e4ba 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -94,6 +94,7 @@ void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, st void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_glu(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 5e34d79a1..ed5d7aefc 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ 
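// Layout recap for the single-tensor (fused) case handled above, one row of
// width ne0 = 2*nc:
//   swapped == 0: [ value | gate ] -> src0_p = row,      src1_p = row + nc
//   swapped == 1: [ gate | value ] -> src0_p = row + nc, src1_p = row
// With a separate src1 tensor both offsets stay 0 and nc is the full ne[0].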
-254,6 +254,30 @@ void ggml_vec_silu_f32(const int n, float * y, const float * x) { } } +void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) { + int i = 0; +#if defined(__AVX512F__) && defined(__AVX512DQ__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i))); + } +#elif defined(__AVX2__) && defined(__FMA__) + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i))); + } +#elif defined(__SSE2__) + for (; i + 3 < n; i += 4) { + _mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i))); + } +#elif defined(__ARM_NEON) && defined(__aarch64__) + for (; i + 3 < n; i += 4) { + vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i))); + } +#endif + for (; i < n; ++i) { + y[i] = ggml_silu_f32(x[i]) * g[i]; + } +} + ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { int i = 0; ggml_float sum = 0; diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 84f6c0e6d..ebd4b7561 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -905,6 +905,60 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con } } +inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) { + for (int i = 0; i < n; ++i) { + y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f; + } +} + +inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + for (int i = 0; i < n; ++i) { + float v = GGML_FP16_TO_FP32(x[i]); + y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v * GGML_FP16_TO_FP32(g[i]) : 0.f); + } +} + +#ifdef GGML_GELU_FP16 +inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) { + uint16_t t; + for (int i = 0; i < n; ++i) { + if (x[i] <= -10.0f) { + y[i] = 0.0f; + } else if (x[i] >= 10.0f) { + y[i] = x[i] * g[i]; + } else { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i]; + } + } +} +#else +inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_f32(x[i]) * g[i]; + } +} +#endif + +inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + const uint16_t * i16 = (const uint16_t *) x; + for (int i = 0; i < n; ++i) { + float v = GGML_FP16_TO_FP32(g[i]); + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v); + } +} + +void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g); + +inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) { + for (int i = 0; i < n; ++i) { + float v = GGML_FP16_TO_FP32(x[i]); + float w = GGML_FP16_TO_FP32(g[i]); + y[i] = GGML_FP32_TO_FP16((v/(1.0f + expf(-v))) * w); + } +} + inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE ggml_float sum = 0.0; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 811422f38..086f9a56c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2303,6 +2303,21 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg return false; } break; + case 
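// The SIMD pattern in ggml_vec_swiglu_f32 above is the usual wide-first
// split: the widest compiled ISA consumes floor(n/W)*W elements (W = 16 for
// AVX-512, 8 for AVX2, 4 for SSE2 and NEON) and the trailing scalar loop
// finishes the n % W remainder, e.g. n = 70 under AVX-512 -> 4 vector
// iterations (64 elements) + 6 scalar ones.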
GGML_OP_GLU: + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_REGLU: + ggml_cuda_op_reglu(ctx, dst); + break; + case GGML_GLU_OP_GEGLU: + ggml_cuda_op_geglu(ctx, dst); + break; + case GGML_GLU_OP_SWIGLU: + ggml_cuda_op_swiglu(ctx, dst); + break; + default: + return false; + } + break; case GGML_OP_NORM: ggml_cuda_op_norm(ctx, dst); break; @@ -3096,6 +3111,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]); + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 2c0375fbe..ba3c0f137 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -196,6 +196,95 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { ggml_cuda_op_unary(ctx, dst); } +/* gated ops */ + +template +static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) { + const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + // perform base op and multiply with gate (either offset in same tensor or a separate one) + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + + dst[i] = (T)(op((float)x[j0]) * (float)g[j1]); +} + +template +static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) { + const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE; + unary_gated_op_kernel<<>>(x, g, dst, k, n, o0, o1); +} + +template +void ggml_cuda_op_unary_gated(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + void * src0_d = src0->data; + void * src1_d = src1 ? src1->data : src0->data; + const int64_t src0_o = src0->nb[1]; + const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + void * dst_d = dst->data; + const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(src0->nb[0] == ggml_element_size(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_nrows(dst) == ggml_nrows(src0)); + + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + GGML_ASSERT(src1->ne[0] == nc); + GGML_ASSERT(src0->type == src1->type); + } + + const int32_t swapped = ((const int32_t *) dst->op_params)[1]; + + if (src0->type == GGML_TYPE_F16) { + half * src0_p = (half *) src0_d; + half * src1_p = (half *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + unary_gated_cuda(src0_p, src1_p, (half *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(half), src1_o / sizeof(half), stream); + } else { + float * src0_p = (float *) src0_d; + float * src1_p = (float *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 
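// Index math in unary_gated_op_kernel above, with n = nc output columns and
// o0/o1 the row strides of x and g in elements:
//   row = i / n, col = i % n
//   j0  = row*o0 + col   // element of x
//   j1  = row*o1 + col   // element of g (j1 == j0 when o0 == o1)
// The strides are nb[1]/sizeof(T), so rows with padding are addressed
// correctly.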
0 : nc; + } + + unary_gated_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), stream); + } +} + +void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + +void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + +void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_unary_gated(ctx, dst); +} + /* silu_back */ static __device__ __forceinline__ float op_silu_back(float grad, float x) { diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 6686fc17e..9094f1d0b 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -15,6 +15,7 @@ #define CUDA_SQRT_BLOCK_SIZE 256 #define CUDA_SIN_BLOCK_SIZE 256 #define CUDA_COS_BLOCK_SIZE 256 +#define CUDA_GLU_BLOCK_SIZE 256 void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst); @@ -57,3 +58,9 @@ void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_swiglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 260440aed..7a9aab316 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -422,6 +422,17 @@ typedef struct { int32_t KHW; // KH * KW, pre-computed on CPU to save GPU resources } ggml_metal_kargs_im2col; +typedef struct{ + int32_t ne00; + uint64_t nb01; + int32_t ne10; + uint64_t nb11; + int32_t ne0; + uint64_t nb1; + int32_t i00; + int32_t i10; +} ggml_metal_kargs_glu; + typedef struct { int64_t ne00; int64_t ne01; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 349f0ff99..12a366957 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -526,6 +526,9 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_SIN, GGML_METAL_KERNEL_TYPE_COS, GGML_METAL_KERNEL_TYPE_NEG, + GGML_METAL_KERNEL_TYPE_REGLU, + GGML_METAL_KERNEL_TYPE_GEGLU, + GGML_METAL_KERNEL_TYPE_SWIGLU, GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_MEAN, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, @@ -1502,6 +1505,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REGLU, reglu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GEGLU, geglu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SWIGLU, swiglu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MEAN, mean, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); @@ -1680,6 +1686,15 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + return 
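// The ggml_metal_kargs_glu struct introduced above carries: ne00/nb01 for
// src0 rows, ne10/nb11 for src1 rows (aliased back to src0 when the gate
// shares the tensor), ne0/nb1 for the output row, and i00/i10 as the column
// offsets selecting the value and gate halves in the fused case (both 0
// when src1 is a separate tensor).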
ggml_is_contiguous_1(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; + default: + return false; + } case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -2419,6 +2434,62 @@ static bool ggml_metal_encode_node( GGML_ABORT("fatal error"); } } break; + case GGML_OP_GLU: + { + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + if (src1) { + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + } + + id pipeline = nil; + + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_REGLU: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REGLU].pipeline; + break; + case GGML_GLU_OP_GEGLU: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GEGLU].pipeline; + break; + case GGML_GLU_OP_SWIGLU: + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SWIGLU].pipeline; + break; + default: + GGML_ABORT("fatal error"); + } + + const int32_t swp = ((const int32_t *) dst->op_params)[1]; + + const int32_t i00 = swp ? ne0 : 0; + const int32_t i10 = swp ? 0 : ne0; + + ggml_metal_kargs_glu args = { + /*.ne00 =*/ ne00, + /*.nb01 =*/ nb01, + /*.ne10 =*/ src1 ? ne10 : ne00, + /*.nb11 =*/ src1 ? nb11 : nb01, + /*.ne0 =*/ ne0, + /*.nb1 =*/ nb1, + /*.i00 =*/ src1 ? 0 : i00, + /*.i10 =*/ src1 ? 0 : i10, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + if (src1) { + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + } else { + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + } + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; + + const int64_t nrows = ggml_nrows(src0); + + const int32_t nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00/2); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_SQR: { GGML_ASSERT(ggml_is_contiguous(src0)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 984a0ab50..fc3cfe35a 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1191,6 +1191,70 @@ kernel void kernel_neg( dst[tpig] = -src0[tpig]; } +kernel void kernel_reglu( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + dst_row[i0] = x0*x1*(x0 > 0.0f); + } +} + +kernel void kernel_geglu( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const 
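// Dispatch shape for these kernels: one threadgroup per row
// (dispatchThreadgroups with nrows x 1 x 1) and nth = MIN(pipeline limit,
// ne00/2) threads per group; the `for (i0 = tpitg; i0 < ne0; i0 += ntg)`
// stride loop lets rows wider than the threadgroup limit still be fully
// covered.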
float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float gelu = 0.5f*x0*(1.0f + precise::tanh(SQRT_2_OVER_PI*x0*(1.0f + GELU_COEF_A*x0*x0))); + + dst_row[i0] = gelu*x1; + } +} + +kernel void kernel_swiglu( + device const char * src0, + device const char * src1, + device char * dst, + constant ggml_metal_kargs_glu & args, + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * src0_row = (device const float *) ((device const char *) src0 + tgpig*args.nb01) + args.i00; + device const float * src1_row = (device const float *) ((device const char *) src1 + tgpig*args.nb11) + args.i10; + device float * dst_row = (device float *) ((device char *) dst + tgpig*args.nb1); + + for (int i0 = tpitg; i0 < args.ne0; i0 += ntg) { + const float x0 = src0_row[i0]; + const float x1 = src1_row[i0]; + + const float silu = x0 / (1.0f + exp(-x0)); + + dst_row[i0] = silu*x1; + } +} + template kernel void kernel_sum_rows( constant ggml_metal_kargs_sum_rows & args, diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index c56924ce8..c7788bdb6 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,12 +1,19 @@ #include "common.hpp" +#include "ggml-sycl/presets.hpp" #include "ggml.h" #include "element_wise.hpp" +#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \ + for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0)) + +#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \ + (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX)) + + static void acc_f32(const float * x, const float * y, float * dst, const int ne, const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); + const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) { + const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0); if (i >= ne) { return; } @@ -21,248 +28,280 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne, } } +/* Unary OP funcs */ template -static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - dst[i] = x[i] > static_cast(0.f) ? static_cast(1.f) : ((x[i] < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); - } +static __dpct_inline__ T op_sgn(T x) { + return x > static_cast(0.f) ? static_cast(1.f) : ((x < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); } template -static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - dst[i] = sycl::fabs(x[i]); - } +static __dpct_inline__ T op_abs(T x) { + return sycl::fabs(x); } template -static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - dst[i] = (x[i] > static_cast(0.f)) ? x[i] : sycl::expm1(x[i]); - } +static __dpct_inline__ T op_elu(T x) { + return (x > static_cast(0.f)) ? 
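// The geglu kernels use the tanh approximation of GELU:
//   gelu(x) ~= 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3)))
// The factored argument SQRT_2_OVER_PI*x*(1 + GELU_COEF_A*x*x) computes the
// same polynomial sqrt(2/pi)*(x + a*x^3) with one multiplication saved.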
x : sycl::expm1(x); } template -static void gelu(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { +static __dpct_inline__ T op_gelu(T x) { const T GELU_COEF_A = static_cast(0.044715f); const T SQRT_2_OVER_PI = static_cast(0.79788456080286535587989211986876f); - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - float xi = x[i]; - dst[i] = static_cast(0.5f) * xi * - (static_cast(1.0f) + - sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast(1.0f) + GELU_COEF_A * xi * xi))); + return static_cast(0.5f) * x * + (static_cast(1.0f) + + sycl::tanh(SQRT_2_OVER_PI * x * (static_cast(1.0f) + GELU_COEF_A * x * x))); } template -static void silu(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] / (static_cast(1.0f) + sycl::native::exp(-x[i])); +static __dpct_inline__ T op_silu(T x) { + return x / (static_cast(1.0f) + sycl::native::exp(-x)); } template -static void gelu_quick(const T *x, T *dst, int k, - const sycl::nd_item<3> &item_ct1) { - const float GELU_QUICK_COEF = -1.702f; - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = x[i] * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i]))); +static __dpct_inline__ T op_gelu_quick(T x) { + const T GELU_QUICK_COEF_LOCAL = static_cast(-1.702f); + return x * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x))); } template -static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { +static __dpct_inline__ T op_gelu_erf(T x) { const T SQRT_2_INV = static_cast(0.70710678118654752440084436210484f); - for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { - auto x_i = x[i]; - dst[i] = static_cast(0.5f) * x_i * (static_cast(1.0f) + sycl::erf(x_i * SQRT_2_INV)); + return static_cast(0.5f) * x * (static_cast(1.0f) + sycl::erf(x * SQRT_2_INV)); +} + +template +static __dpct_inline__ T op_tanh(T x) { + return sycl::tanh(x); +} + +template +static __dpct_inline__ T op_relu(T x) { + return sycl::fmax(x, static_cast(0)); +} + +template +static __dpct_inline__ T op_sigmoid(T x) { + return static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(-x)); +} + +template +static __dpct_inline__ T op_sqrt(T x) { + return sycl::sqrt(x); +} + +template +static __dpct_inline__ T op_sin(T x) { + return sycl::sin(x); +} + +template +static __dpct_inline__ T op_cos(T x) { + return sycl::cos(x); +} + +template +static __dpct_inline__ T op_hardsigmoid(T x) { + return sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); +} + +template +static __dpct_inline__ T op_hardswish(T x) { + return x * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); +} + +template +static __dpct_inline__ T op_exp(T x) { + return sycl::exp(x); +} + +template +static __dpct_inline__ T op_log(T x) { + if (x <= static_cast(0)) { + return neg_infinity(); + } + return sycl::log(x); +} + +template +static __dpct_inline__ T op_neg(T x) { + return -x; +} + +template +static __dpct_inline__ T op_step(T x) { + return (x > static_cast(0.0f)) ? 
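// sycl::native::exp in op_silu/op_sigmoid/op_gelu_quick maps to the fast
// hardware exponential where one exists, trading a few ulp of accuracy for
// throughput; op_log keeps the precise sycl::log and adds an explicit
// x <= 0 -> -infinity guard.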
static_cast(1.0f) : static_cast(0.0f); +} + +template +static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) { + T neg_slope_T = static_cast(negative_slope); + return sycl::fmax(x, static_cast(0)) + + sycl::fmin(x, static_cast(0.0f)) * neg_slope_T; +} + +template +static __dpct_inline__ T op_sqr(T x) { + return x * x; +} + +template +static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) { + return x < static_cast(min_val) ? static_cast(min_val) : (x > static_cast(max_val) ? static_cast(max_val) : x); +} + +template +static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sgn(x[i]); } } template -static void tanh(const T *x, T *dst, int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; - } - dst[i] = sycl::tanh((x[i])); -} - -template -static void relu(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::fmax((x[i]), static_cast(0)); -} - -template -static void sigmoid(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = 1.0f / (static_cast(1.0f) + sycl::native::exp(-x[i])); -} - -template -static void sqrt(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::sqrt(x[i]); -} - -template -static void sin(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::sin(x[i]); -} - -template -static void cos(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::cos(x[i]); -} - -template -static void hardsigmoid(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x[i] + static_cast(3.0f)) / static_cast(6.0f))); -} - -template -static void hardswish(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = x[i] * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x[i] + static_cast(3.0f)) / static_cast(6.0f))); -} - -template -static void exp(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - dst[i] = sycl::exp(x[i]); -} - -template -static void log(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if 
(i >= k) { - return; - } - T xi = x[i]; - if (xi <= 0) { - dst[i] = neg_infinity(); - } else { - dst[i] = sycl::log(xi); +static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_abs(x[i]); } } template -static void neg(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; +static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_elu(x[i]); } - dst[i] = -x[i]; } template -static void step(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; +static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu(x[i]); } - dst[i] = x[i] > static_cast(0.0f); } template -static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - if (i >= k) { - return; +static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_silu(x[i]); } - dst[i] = sycl::fmax((x[i]), static_cast(0)) + - sycl::fmin((x[i]), static_cast(0.0f)) * negative_slope; } template -static void sqr(const T * x, T * dst, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; +static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu_quick(x[i]); + } +} + +template +static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu_erf(x[i]); + } +} + +template +static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_tanh(x[i]); + } +} + +template +static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_relu(x[i]); + } +} + +template +static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sigmoid(x[i]); + } +} + +template +static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sqrt(x[i]); + } +} + +template +static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sin(x[i]); + } +} + +template +static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_cos(x[i]); + } +} + +template +static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, 
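// SYCL_GLOBAL_ID_LOOP, used by all the new unary kernels, expands to a
// grid-stride loop:
//   for (auto i = item.get_global_id(0); i < k; i += item.get_global_range(0))
// so a launch of any size covers all k elements, and the per-kernel
// early-return bounds checks of the old nd_item<3> kernels become
// unnecessary.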
item_ct1) { + dst[i] = op_hardsigmoid(x[i]); + } +} + +template +static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_hardswish(x[i]); + } +} + +template +static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_exp(x[i]); + } +} + +template +static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_log(x[i]); + } +} + +template +static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_neg(x[i]); + } +} + +template +static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_step(x[i]); + } +} + +template +static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_leaky_relu(x[i], negative_slope); + } +} + +template +static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sqr(x[i]); + } +} + +template +static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_clamp(x[i], min_val, max_val); } - dst[i] = x[i] * x[i]; } template @@ -281,10 +320,10 @@ static void upscale(const T *x, T *dst, const int nb00, const int nb01, int i12 = (index / (ne10 * ne11)) % ne12; int i13 = (index / (ne10 * ne11 * ne12)) % ne13; - int i00 = i10 / sf0; - int i01 = i11 / sf1; - int i02 = i12 / sf2; - int i03 = i13 / sf3; + int i00 = static_cast(i10 / sf0); + int i01 = static_cast(i11 / sf1); + int i02 = static_cast(i12 / sf2); + int i03 = static_cast(i13 / sf3); dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); } @@ -292,8 +331,7 @@ static void upscale(const T *x, T *dst, const int nb00, const int nb01, template static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02, const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); + int nidx = SYCL_LOCAL_ID_CALC(item_ct1, 2); if (nidx >= ne0) { return; } @@ -310,246 +348,55 @@ static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne } } - template static void clamp(const T * x, T * dst, const float min, const float max, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; + const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = x[i] < static_cast(min) ? static_cast(min) : (x[i] > static_cast(max) ? static_cast(max) : x[i]); } - - dst[i] = x[i] < static_cast(min) ? static_cast(min) : (x[i] > static_cast(max) ? 
static_cast(max) : x[i]); } +template +static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_relu(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_silu(x[j0]) * g[j1]; + } +} + +namespace ggml_sycl_detail { static void acc_f32_sycl(const float *x, const float *y, float *dst, const int n_elements, const int ne10, const int ne11, const int ne12, const int nb1, const int nb2, const int offset, queue_ptr stream) { - int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; + int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE); sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1); - }); -} - -template -static void gelu_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); }); -} - -template -static void silu_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); }); -} - -template -static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) { - // hard code for now - const int num_blocks = ceil_div(k, 256); - sycl_parallel_for( - stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), - [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); }); -} - -template -static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) { - // hard code for now - const int num_blocks = ceil_div(k, 256); - sycl_parallel_for( - stream, - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), - [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); }); -} - - -template -static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) { - // hard code for now - const int num_blocks = ceil_div(k, 256); - 
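// ceil_div(k, bs) -- i.e. (k + bs - 1)/bs -- rounds the block count up so
// that num_blocks*bs >= k always holds, e.g. k = 1000, bs = 256 -> 4 blocks
// (1024 slots); the grid-stride loop simply never enters for the 24 excess
// work-items.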
sycl_parallel_for( - stream, - sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), - [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); }); -} - -template -static void gelu_quick_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); }); -} - - -template -static void gelu_erf_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); }); -} - -template -static void tanh_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); }); -} - -template -static void relu_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); }); -} - -template -static void hardsigmoid_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; - sycl_parallel_for( - stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); }); -} - -template -static void hardswish_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; - sycl_parallel_for( - stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); }); -} - -template -static void exp_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); }); -} - -template -static void log_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { 
log(x, dst, k, item_ct1); }); -} - -template -static void neg_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); }); -} - -template -static void step_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); }); -} - -template -static void sigmoid_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE; - sycl_parallel_for( - stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); }); -} - -template -static void sqrt_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); }); -} - -template -static void sin_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); }); -} - -template -static void cos_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); }); -} - -template -static void leaky_relu_sycl(const T *x, T *dst, const int k, - const float negative_slope, - queue_ptr stream) { - const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); }); -} - -template -static void sqr_sycl(const T *x, T *dst, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); }); + sycl::nd_range<1>(sycl::range<1>(num_blocks) * + sycl::range<1>(SYCL_ACC_BLOCK_SIZE), + sycl::range<1>(SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + 
acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, + item_ct1); + }); } template @@ -558,7 +405,7 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, const int ne12, const int ne13, const float sf0, const float sf1, const float sf2, const float sf3, queue_ptr stream) { int dst_size = ne10 * ne11 * ne12 * ne13; - int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE; + int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE); sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); sycl_parallel_for<1>( stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { @@ -570,7 +417,7 @@ template static void pad_sycl(const T *x, T *dst, const int ne00, const int ne01, const int ne02, const int ne0, const int ne1, const int ne2, queue_ptr stream) { - int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; + int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE); sycl::range<3> gridDim(ne2, ne1, num_blocks); sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE), @@ -578,115 +425,8 @@ static void pad_sycl(const T *x, T *dst, const int ne00, [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); }); } -template -static void clamp_sycl(const T *x, T *dst, const float min, - const float max, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; - sycl_parallel_for(stream, - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); }); -} - -inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); 
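The acc launch rewritten above also drops the degenerate leading dimensions, and upscale/pad now spell out ceil_div instead of the inline (n + bs - 1) / bs. A 3D nd_range of shape (1, 1, n) and its 1D counterpart enumerate exactly the same work-items; a small illustration (flatten_range is a hypothetical helper):

    #include <sycl/sycl.hpp>

    // Old form: sycl::nd_range<3>(sycl::range<3>(1, 1, n_blk) * sycl::range<3>(1, 1, bs),
    //                             sycl::range<3>(1, 1, bs))
    // New form: the equivalent 1D range below.
    inline sycl::nd_range<1> flatten_range(int n_blk, int bs) {
        return sycl::nd_range<1>(sycl::range<1>(n_blk * bs), sycl::range<1>(bs));
    }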
- abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - - -inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -702,14 +442,14 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst case GGML_TYPE_F16: { auto data_pts = cast_data(dst); - silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); break; } #endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); - silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); break; } default: @@ -717,7 +457,8 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst } } -inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -728,19 +469,66 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->src[0]->type == dst->type); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;; + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_is_contiguous_1(dst->src[0])); + GGML_ASSERT(ggml_is_contiguous(dst)); + const int32_t swapped = ((const int32_t *) dst->op_params)[1]; + void * src0_d = src0->data; + void * src1_d = src1 ? src1->data : src0->data; + const int64_t src0_o = src0->nb[1]; + const int64_t src1_o = src1 ? 
src1->nb[1] : src0->nb[1]; + void * dst_d = dst->data; + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + GGML_ASSERT(src1->ne[0] == nc); + GGML_ASSERT(src0->type == src1->type); + } switch (dst->type) { #if defined (GGML_SYCL_F16) case GGML_TYPE_F16: { - auto data_pts = cast_data(dst); - gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + sycl::half * src0_p = (sycl::half *) src0_d; + sycl::half * src1_p = (sycl::half *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + kernel_invoker(src0_p, + src1_p, + (sycl::half *) dst_d, + ggml_nelements(dst), + nc, + src0_o / sizeof(sycl::half), + src1_o / sizeof(sycl::half), + main_stream, + std::forward(args)...); break; } #endif case GGML_TYPE_F32: { - auto data_pts = cast_data(dst); - gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + float * src0_p = (float *) src0_d; + float * src1_p = (float *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + kernel_invoker(src0_p, + src1_p, + (float *) dst_d, + ggml_nelements(dst), + nc, + src0_o / sizeof(float), + src1_o / sizeof(float), + main_stream, + std::forward(args)...); break; } default: @@ -748,511 +536,8 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst } } -inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - gelu_erf_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - - -inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || 
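In the fused case handled above (no separate src1), both GLU operands live in the two halves of each src0 row and `swapped` selects which half feeds the activation; dst keeps only nc = ne0/2 columns. The pointer arithmetic in isolation (glu_pointers is an illustrative name):

    #include <cstdint>

    // Mirrors the src0_p/src1_p adjustment in dispatch_ggml_sycl_op_fused_glu:
    // row layout is [x | g], or [g | x] when swapped.
    template <typename T>
    static void glu_pointers(T * src0, bool swapped, int64_t nc,
                             T *& x_p, T *& g_p) {
        x_p = src0 + (swapped ? nc : 0);  // activation operand
        g_p = src0 + (swapped ? 0 : nc);  // gate operand
    }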
dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto 
data_pts = cast_data(dst); - hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - - 
dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - 
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - - GGML_ASSERT(dst->src[0]->type == dst->type); - float negative_slope; - memcpy(&negative_slope, dst->op_params, sizeof(float)); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - #if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... 
args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -1274,18 +559,18 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * case GGML_TYPE_F16: { auto data_pts = cast_data(dst); - upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], - dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, - main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], + (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, + main_stream, std::forward(args)...); break; } #endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); - upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], - dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, - main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], + (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, + main_stream, std::forward(args)...); break; } default: @@ -1293,7 +578,8 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * } } -inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +template +static inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -1302,7 +588,7 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->type == GGML_TYPE_F32); #endif GGML_ASSERT(dst->src[0]->type == dst->type); - GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); switch (dst->type) { @@ -1310,16 +596,16 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) case GGML_TYPE_F16: { auto data_pts = cast_data(dst); - pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], - dst->ne[1], dst->ne[2], main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->ne[0], (int)dst->src[0]->ne[1], (int)dst->src[0]->ne[2], (int)dst->ne[0], + (int)dst->ne[1], (int)dst->ne[2], main_stream, std::forward(args)...); break; } #endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); - pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], - dst->ne[1], dst->ne[2], main_stream); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->ne[0], (int)dst->src[0]->ne[1], (int)dst->src[0]->ne[2], (int)dst->ne[0], + (int)dst->ne[1], (int)dst->ne[2], main_stream, std::forward(args)...); break; } default: @@ -1327,45 +613,320 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } } -inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * 
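The (int) casts introduced in the upscale and pad dispatchers narrow ggml's int64_t ne/nb values to the int parameters the kernels take. A defensive variant, not what the patch does, would assert the fit first:

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Hypothetical guard for the int64_t -> int narrowing done by those casts.
    static inline int checked_int(int64_t v) {
        assert(v >= 0 && v <= std::numeric_limits<int>::max());
        return static_cast<int>(v);
    }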
dst) { -#if defined(GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else +} // namespace ggml_sycl_detail - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - float min; - float max; - memcpy(&min, dst->op_params, sizeof(float)); - memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); - switch (dst->type) { -#if defined(GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } + +static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); } -inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} +static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE), + sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(stream, 
+ sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE), + sycl::range<1>(SYCL_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE), + 
sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), + sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), + sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), + sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), + sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE), + sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE), + sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1); + }); 
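Each rewritten op hands the dispatcher a generic lambda ([](const auto* src, auto* dst_ptr, ...)), so one body serves both dtypes: the F16 and F32 branches of dispatch_ggml_sycl_op_unary each trigger their own instantiation. A minimal model of that mechanism (stand-in types, no SYCL):

    // One generic lambda, two instantiations, as the dtype switch produces.
    int main() {
        auto invoke = [](const auto * src, auto * dst) { *dst = *src; };
        float  f_in = 1.0f, f_out = 0.0f;
        double h_in = 1.0,  h_out = 0.0;   // stand-in for sycl::half
        invoke(&f_in, &f_out);  // instantiation for float
        invoke(&h_in, &h_out);  // instantiation for the half-like type
        return 0;
    }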
+ }); +} + +static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), + sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), + sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) { + const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1); + }); + }, negative_slope); +} + +static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE), + sycl::range<1>(SYCL_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst, + [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03, + int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3, + queue_ptr stream) { + ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream); + }); +} + +static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_pad(ctx, dst, + [](const auto* src, auto* dst_ptr, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, + queue_ptr stream) { + ggml_sycl_detail::pad_sycl(src, dst_ptr, ne00, ne01, ne02, ne0, ne1, ne2, stream); + }); +} + +static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + float min_val; + float max_val; + memcpy(&min_val, dst->op_params, sizeof(float)); + memcpy(&max_val, (float *) dst->op_params + 1, sizeof(float)); + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* 
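leaky_relu and clamp read their scalars out of dst->op_params with memcpy before entering the dispatcher, then forward them through the variadic Args pack. The memcpy is the aliasing-safe way to reinterpret the int32 parameter slots as floats; as a helper (op_param_f32 is hypothetical):

    #include <cstdint>
    #include <cstring>

    // Safely reinterpret op_params[idx] (an int32 slot) as a float.
    static inline float op_param_f32(const int32_t * op_params, int idx) {
        float v;
        std::memcpy(&v, op_params + idx, sizeof(v));
        return v;
    }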
dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) { + const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE); + sycl_parallel_for(stream, + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE), + sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1); + }); + }, min_val, max_val); +} + +static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -1381,7 +942,40 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused int offset = dst->op_params[3] / 4; // offset in bytes - acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream); + ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream); +} + +static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu + sycl_parallel_for(main_stream, + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); } @@ -1509,3 +1103,18 @@ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_elu(ctx, dst); } + +void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + 
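Note the mixed widths in the fused-GLU launches above: geglu feeds the raw uint64_t k straight into ceil_div, while reglu and swiglu cast to uint32_t first; either way the result narrows into a uint32_t block count. A width-preserving helper (hypothetical) would sidestep the inconsistency:

    #include <cstdint>

    // ceil_div that keeps the operand width instead of mixing 64- and 32-bit.
    template <typename U>
    static inline U ceil_div_u(U num, U den) {
        return (num + den - 1) / den;
    }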
ggml_sycl_op_geglu(ctx, dst); +} + +void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_reglu(ctx, dst); +} + +void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_swiglu(ctx, dst); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index bd40113f0..86068b101 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -3,27 +3,30 @@ #include "common.hpp" #include "ggml.h" -#include +#include // For std::numeric_limits template T neg_infinity() { return -std::numeric_limits::infinity(); } -template +template struct typed_data { - const T * src; - T * dst; + const T_Src * src; + T_Dst * dst; }; -template -typed_data cast_data(ggml_tensor * dst) { +template +typed_data cast_data(ggml_tensor * dst) { return { - /* .src = */ static_cast(dst->src[0]->data), - /* .dst = */ static_cast(dst->data) + /* .src = */ static_cast(dst->src[0]->data), + /* .dst = */ static_cast(dst->data) }; } +const float GELU_QUICK_COEF = -1.702f; + + void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); @@ -73,5 +76,9 @@ void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -#endif // GGML_SYCL_ELEMENTWISE_HPP +void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 9cb36ae99..ae5e06257 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3676,6 +3676,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_REGLU: + ggml_sycl_reglu(ctx, dst); + break; + case GGML_GLU_OP_GEGLU: + ggml_sycl_geglu(ctx, dst); + break; + case GGML_GLU_OP_SWIGLU: + ggml_sycl_swiglu(ctx, dst); + break; + default: + return false; + } + break; case GGML_OP_NORM: ggml_sycl_norm(ctx, dst); break; @@ -4212,6 +4227,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g default: return false; } + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous_1(op->src[0]); + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index aebcc0391..4696f1fe4 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -437,6 +437,10 @@ struct vk_device_struct { vk_pipeline pipeline_tanh[2]; vk_pipeline pipeline_sigmoid[2]; + vk_pipeline pipeline_geglu[2]; + vk_pipeline pipeline_reglu[2]; + vk_pipeline pipeline_swiglu[2]; + vk_pipeline pipeline_leaky_relu_f32; vk_pipeline pipeline_silu_back_f32; vk_pipeline pipeline_diag_mask_inf_f32; @@ -661,6 +665,13 @@ struct vk_op_push_constants { float 
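typed_data and cast_data now carry independent source and destination element types instead of a single T, which leaves room for mixed-precision ops. The generalized shape in miniature (demo types, not the real tensors):

    #include <cstdint>

    // Same layout as the patched typed_data, with src/dst types decoupled;
    // e.g. typed_data_demo<uint16_t, float> for a hypothetical F16 -> F32 op.
    template <typename TSrc, typename TDst>
    struct typed_data_demo {
        const TSrc * src;
        TDst *       dst;
    };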
param2; }; +struct vk_op_glu_push_constants { + uint32_t N; + uint32_t ne00; + uint32_t ne20; + uint32_t mode; // 0: default, 1: swapped, 2: split +}; + struct vk_op_unary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; @@ -2757,6 +2768,15 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_UNARY(sigmoid) #undef CREATE_UNARY +#define CREATE_GLU(name) \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 3, sizeof(vk_op_glu_push_constants), {512, 1, 1}, {}, 1, true); + + CREATE_GLU(geglu) + CREATE_GLU(reglu) + CREATE_GLU(swiglu) +#undef CREATE_GLU + ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); @@ -6473,6 +6493,24 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const break; } return nullptr; + case GGML_OP_GLU: + if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) || + (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) || + (src0->type != dst->type)) { + return nullptr; + } + + switch (ggml_get_glu_op(dst)) { + case GGML_GLU_OP_GEGLU: + return ctx->device->pipeline_geglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_REGLU: + return ctx->device->pipeline_reglu[dst->type == GGML_TYPE_F16]; + case GGML_GLU_OP_SWIGLU: + return ctx->device->pipeline_swiglu[dst->type == GGML_TYPE_F16]; + default: + break; + } + return nullptr; case GGML_OP_DIAG_MASK_INF: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_diag_mask_inf_f32; @@ -6933,6 +6971,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_CONCAT: case GGML_OP_UPSCALE: case GGML_OP_UNARY: + case GGML_OP_GLU: case GGML_OP_CONV_2D_DW: { uint32_t ne = ggml_nelements(dst); @@ -6973,7 +7012,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } } - if (op == GGML_OP_SOFT_MAX) { + if (op == GGML_OP_SOFT_MAX || op == GGML_OP_GLU) { // Empty src1 is possible in soft_max, but the shader needs a buffer vk_subbuffer subbuf_y; if (use_src1) { @@ -7566,6 +7605,25 @@ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, con ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); } +static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { + const bool swapped = (bool)dst->op_params[1]; + const bool split = src1 != nullptr; + + GGML_ASSERT(ggml_is_contiguous(src0)); + + if (!split) { + GGML_ASSERT(src0->ne[0] / 2 == dst->ne[0]); + } else { + GGML_ASSERT(src0->ne[0] == src1->ne[0]); + GGML_ASSERT(src0->ne[0] == dst->ne[0]); + GGML_ASSERT(src0->type == src1->type); + } + + const uint32_t mode = split ? 2 : (swapped ? 
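The GLU pipelines are created with sizeof(vk_op_glu_push_constants), which must match the four-uint push_constant block declared in glu_head.comp; four std430 uints are 16 bytes. That invariant can be spelled out (demo mirror, not from the patch):

    #include <cstdint>

    // Mirror of vk_op_glu_push_constants: must stay 16 bytes to match the
    // uint N/ne00/ne20/mode push_constant block in glu_head.comp.
    struct vk_op_glu_push_constants_demo {
        uint32_t N, ne00, ne20, mode;
    };
    static_assert(sizeof(vk_op_glu_push_constants_demo) == 16,
                  "out of sync with the GLU shader push constants");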
1 : 0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GLU, { (uint32_t)ggml_nelements(dst), (uint32_t)src0->ne[0], (uint32_t)dst->ne[0], mode }, dryrun); +} + static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { int32_t * op_params = (int32_t *)dst->op_params; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun); @@ -8778,6 +8836,16 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + break; + default: + return false; + } + break; case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_GET_ROWS: @@ -8870,6 +8938,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_RMS_NORM_BACK: case GGML_OP_L2_NORM: case GGML_OP_UNARY: + case GGML_OP_GLU: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: @@ -9013,6 +9082,17 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(node)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + ggml_vk_glu(ctx, compute_ctx, src0, src1, node, dryrun); + break; + default: + return false; + } + break; case GGML_OP_DIAG_MASK_INF: ggml_vk_diag_mask_inf(ctx, compute_ctx, src0, node, dryrun); @@ -9138,8 +9218,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr if (!ok) { if (node->op == GGML_OP_UNARY) { std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast(node->op_params[0])) << ")" << std::endl; - } - else { + } else if (node->op == GGML_OP_GLU) { + std::cerr << __func__ << ": error: op not supported GLU " << node->name << " (" << ggml_glu_op_name(static_cast(node->op_params[0])) << ")" << std::endl; + } else { std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl; } } @@ -9218,6 +9299,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(tensor)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + buf = tensor->buffer; + break; + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: case GGML_OP_FLASH_ATTN_EXT: @@ -10016,6 +10108,19 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return false; } break; + case GGML_OP_GLU: + switch (ggml_get_glu_op(op)) { + case GGML_GLU_OP_GEGLU: + case GGML_GLU_OP_REGLU: + case GGML_GLU_OP_SWIGLU: + return ggml_is_contiguous(op->src[0]) && + (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && + (op->src[0]->type == op->type); + default: + return false; + } + break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { @@ -10746,6 +10851,12 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ABORT("fatal error"); } + } else if (tensor->op == GGML_OP_GLU) { + if (src_clone[1] == nullptr) { + 
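ggml_vk_glu encodes the operating mode that glu_main.comp branches on: 0 for the fused layout, 1 for fused with the halves swapped, 2 for a separate gate tensor. A round-trip check of that encoding (glu_mode is illustrative):

    #include <cassert>
    #include <cstdint>

    // Same encoding as `split ? 2 : (swapped ? 1 : 0)` in ggml_vk_glu.
    static uint32_t glu_mode(bool split, bool swapped) {
        return split ? 2u : (swapped ? 1u : 0u);
    }

    int main() {
        assert(glu_mode(false, false) == 0u);  // default
        assert(glu_mode(false, true)  == 1u);  // swapped
        assert(glu_mode(true,  false) == 2u);  // split
        return 0;
    }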
tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]); + } else { + tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); + } } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { if (src1 == nullptr) { tensor_clone = ggml_dup(ggml_ctx, src_clone[0]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp new file mode 100644 index 000000000..f4268ed24 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp @@ -0,0 +1,13 @@ +#version 450 + +#include "glu_head.comp" + +const float GELU_COEF_A = 0.044715f; +const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + +float op(float a, float b) { + const float val = SQRT_2_OVER_PI*a*(1.0f + GELU_COEF_A*a*a); + return 0.5f*a*(2.0f - 2.0f / (exp(2 * val) + 1)) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp new file mode 100644 index 000000000..41a298890 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp @@ -0,0 +1,15 @@ +#extension GL_EXT_shader_16bit_storage : require + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {A_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; + +layout (push_constant) uniform parameter +{ + uint N; + uint ne00; + uint ne20; + uint mode; +} p; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp new file mode 100644 index 000000000..85cf65a9e --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp @@ -0,0 +1,29 @@ +void main() { + const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; + + if (i >= p.N) { + return; + } + + const uint row = i / p.ne20; + const uint col = i - row * p.ne20; + + if (p.mode == 0) { + // Default + const uint offset = p.ne00 / 2; + const uint idx = row * p.ne00 + col; + + data_d[row * offset + col] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset]))); + } else if (p.mode == 1) { + // Swapped + const uint offset = p.ne00 / 2; + const uint idx = row * p.ne00 + col; + + data_d[row * offset + col] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx]))); + } else { + // Split + const uint idx = row * p.ne00 + col; + + data_d[idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx]))); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp new file mode 100644 index 000000000..0073d8f76 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp @@ -0,0 +1,9 @@ +#version 450 + +#include "glu_head.comp" + +float op(float a, float b) { + return max(a, 0.0f) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp new file mode 100644 index 000000000..a28e7c6cc --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp @@ -0,0 +1,9 @@ +#version 450 + +#include "glu_head.comp" + +float op(float a, float b) { + return a / (1.0f + exp(-a)) * b; +} + +#include "glu_main.comp" diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index a207b98c6..23fc50bf2 100644 --- 
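A CPU restatement of the indexing in glu_main.comp may help (C++ rather than GLSL; glu_ref is illustrative): one loop iteration per shader invocation, with ne20 the output row length, ne00/2 in the fused modes and ne00 when split. geglu's op() is the usual 0.5*a*(1 + tanh(v))*b gate, rewritten via the identity tanh(v) = 1 - 2/(exp(2v) + 1).

    #include <cstdint>

    // Reference model of glu_main.comp's addressing.
    static void glu_ref(const float * a, const float * b, float * d,
                        uint32_t N, uint32_t ne00, uint32_t ne20, uint32_t mode,
                        float (*op)(float, float)) {
        for (uint32_t i = 0; i < N; ++i) {
            const uint32_t row  = i / ne20;
            const uint32_t col  = i - row * ne20;
            const uint32_t idx  = row * ne00 + col;
            const uint32_t half = ne00 / 2;
            if (mode == 0) {          // fused: row is [x | g]
                d[row * half + col] = op(a[idx], a[idx + half]);
            } else if (mode == 1) {   // fused, swapped: row is [g | x]
                d[row * half + col] = op(a[idx + half], a[idx]);
            } else {                  // split: gate in its own buffer
                d[idx] = op(a[idx], b[idx]);
            }
        }
    }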
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -585,6 +585,13 @@ void process_shaders() {
     string_to_spv("sigmoid_f16", "sigmoid.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("sigmoid_f32", "sigmoid.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("geglu_f16", "geglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("geglu_f32", "geglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("reglu_f16", "reglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("reglu_f32", "reglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("swiglu_f16", "swiglu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("swiglu_f32", "swiglu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
     string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1262236c0..14000b55a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -982,9 +982,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS",
     "CROSS_ENTROPY_LOSS_BACK",
     "OPT_STEP_ADAMW",
+
+    "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1079,9 +1081,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss(x,y)",
     "cross_entropy_loss_back(x,y)",
     "adamw(x)",
+
+    "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 84, "GGML_OP_COUNT != 84");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -1107,6 +1111,15 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
 
 static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
 
+static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
+    "REGLU",
+    "GEGLU",
+    "SWIGLU",
+};
+
+static_assert(GGML_GLU_OP_COUNT == 3, "GGML_GLU_OP_COUNT != 3");
+
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
@@ -1209,11 +1222,19 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
     return GGML_UNARY_OP_NAME[op];
 }
 
+const char * ggml_glu_op_name(enum ggml_glu_op op) {
+    return GGML_GLU_OP_NAME[op];
+}
+
 const char * ggml_op_desc(const struct ggml_tensor * t) {
     if (t->op == GGML_OP_UNARY) {
         enum ggml_unary_op uop = ggml_get_unary_op(t);
         return ggml_unary_op_name(uop);
     }
+    if (t->op == GGML_OP_GLU) {
+        enum ggml_glu_op gop = ggml_get_glu_op(t);
+        return ggml_glu_op_name(gop);
+    }
     return ggml_op_name(t->op);
 }
 
@@ -1730,6 +1751,11 @@ enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
     return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
 }
 
+enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->op == GGML_OP_GLU);
+    return (enum ggml_glu_op) ggml_get_op_params_i32(tensor, 0);
+}
+
 const char * ggml_get_name(const struct ggml_tensor * tensor) {
     return tensor->name;
 }
@@ -2609,6 +2635,114 @@ struct ggml_tensor * ggml_exp_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_EXP);
 }
 
+// ggml_glu
+
+static struct ggml_tensor * ggml_glu_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        enum ggml_glu_op op,
+        bool swapped) {
+    GGML_ASSERT(ggml_is_contiguous_1(a));
+
+    if (b) {
+        GGML_ASSERT(ggml_is_contiguous_1(b));
+        GGML_ASSERT(ggml_are_same_shape(a, b));
+        GGML_ASSERT(a->type == b->type);
+    }
+
+    int64_t ne[GGML_MAX_DIMS] = { a->ne[0] / 2 }; for (int i = 1; i < GGML_MAX_DIMS; i++) ne[i] = a->ne[i];
+    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, GGML_MAX_DIMS, b ? a->ne : ne, NULL, 0);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) op);
+    ggml_set_op_params_i32(result, 1, (int32_t) swapped);
+
+    result->op     = GGML_OP_GLU;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_glu(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_glu_op op,
+        bool swapped) {
+    return ggml_glu_impl(ctx, a, NULL, op, swapped);
+}
+
+struct ggml_tensor * ggml_glu_split(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        enum ggml_glu_op op) {
+    return ggml_glu_impl(ctx, a, b, op, false);
+}
+
+// ggml_reglu
+
+struct ggml_tensor * ggml_reglu(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, false);
+}
+
+struct ggml_tensor * ggml_reglu_swapped(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_REGLU, true);
+}
+
+struct ggml_tensor * ggml_reglu_split(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_REGLU, false);
+}
+
+// ggml_geglu
+
+struct ggml_tensor * ggml_geglu(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, false);
+}
+
+struct ggml_tensor * ggml_geglu_swapped(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_GEGLU, true);
+}
+
+struct ggml_tensor * ggml_geglu_split(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_GEGLU, false);
+}
+
+// ggml_swiglu
+
+struct ggml_tensor * ggml_swiglu(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, false);
+}
+
+struct ggml_tensor * ggml_swiglu_swapped(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_glu_impl(ctx, a, NULL, GGML_GLU_OP_SWIGLU, true);
+}
+
+struct ggml_tensor * ggml_swiglu_split(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b) {
+    return ggml_glu_impl(ctx, a, b, GGML_GLU_OP_SWIGLU, false);
+}
+
 // ggml_norm
 
 static struct ggml_tensor * ggml_norm_impl(
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 71ee431a9..010300df6 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -560,12 +560,20 @@ ggml_tensor * llm_graph_context::build_ffn(
     switch (type_op) {
         case LLM_FFN_SILU:
-            {
+            if (gate && type_gate == LLM_FFN_PAR) {
+                cur = ggml_swiglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_swiglu", il);
+                type_gate = LLM_FFN_SEQ;
+            } else {
                 cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_silu", il);
             } break;
         case LLM_FFN_GELU:
-            {
+            if (gate && type_gate == LLM_FFN_PAR) {
+                cur = ggml_geglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_geglu", il);
+                type_gate = LLM_FFN_SEQ;
+            } else {
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_gelu", il);
                 if (act_scales != NULL) {
@@ -574,7 +582,11 @@ ggml_tensor * llm_graph_context::build_ffn(
                 }
             } break;
         case LLM_FFN_RELU:
-            {
+            if (gate && type_gate == LLM_FFN_PAR) {
+                cur = ggml_reglu_split(ctx0, cur, tmp);
+                cb(cur, "ffn_reglu", il);
+                type_gate = LLM_FFN_SEQ;
+            } else {
                 cur = ggml_relu(ctx0, cur);
                 cb(cur, "ffn_relu", il);
             } break;
@@ -588,32 +600,19 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
         case LLM_FFN_SWIGLU:
             {
-                // Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
-                int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
-
-                x0 = ggml_silu(ctx0, x0);
-                cb(cur, "ffn_silu", il);
-
-                cur = ggml_mul(ctx0, x0, x1);
-                cb(cur, "ffn_mul", il);
+                cur = ggml_swiglu(ctx0, cur);
+                cb(cur, "ffn_swiglu", il);
             } break;
         case LLM_FFN_GEGLU:
             {
-                // Split into two equal parts
-                int64_t split_point = cur->ne[0] / 2;
-                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
-                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
-                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
-
-                x0 = ggml_gelu(ctx0, x0);
-                cb(x0, "ffn_gelu", il);
-
-                cur = ggml_mul(ctx0, x0, x1);
+                cur = ggml_geglu(ctx0, cur);
                 cb(cur, "ffn_geglu", il);
             } break;
+        case LLM_FFN_REGLU:
+            {
+                cur = ggml_reglu(ctx0, cur);
+                cb(cur, "ffn_reglu", il);
+            } break;
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
@@ -743,12 +742,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     switch (type_op) {
         case LLM_FFN_SILU:
-            {
+            if (gate_exps) {
+                cur = ggml_swiglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_swiglu", il);
+            } else {
                 cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
-            {
+            if (gate_exps) {
+                cur = ggml_geglu_split(ctx0, cur, up);
+                cb(cur, "ffn_moe_geglu", il);
+            } else {
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
@@ -756,11 +761,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
-    if (gate_exps) {
-        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
-        cb(cur, "ffn_moe_gate_par", il);
-    }
-
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index ee2197e89..ceddb6021 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -38,6 +38,7 @@ enum llm_ffn_op_type {
     LLM_FFN_RELU_SQR,
     LLM_FFN_SWIGLU,
     LLM_FFN_GEGLU,
+    LLM_FFN_REGLU,
 };
 
 enum llm_ffn_gate_type {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index ec088bae2..16c426857 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1106,6 +1106,107 @@ struct test_unary : public test_case {
 
 };
 
+// GGML_OP_GLU
+struct test_glu : public test_case {
+    const ggml_glu_op op;
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    int v; // view (1 : non-contiguous a)
+    bool swapped;
+
+    std::string vars() override {
+        return VARS_TO_STR4(type, ne_a, v, swapped);
+    }
+
+    test_glu(ggml_glu_op op,
+            ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
+            int v = 0,
+            bool swapped = false)
+        : op(op), type(type), ne_a(ne_a), v(v), swapped(swapped) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            ggml_set_name(a, "a");
+
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view_of_a");
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            ggml_set_name(a, "a");
+        }
+
+        ggml_tensor * out = ggml_glu(ctx, a, op, swapped);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            // test extended range of values to check for NaNs in GELU
+            init_tensor_uniform(t, -150.f, 150.f);
+        }
+    }
+};
+
+struct test_glu_split : public test_case {
+    const ggml_glu_op op;
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    int v; // view (1 : non-contiguous a)
+
+    std::string vars() override {
+        return VARS_TO_STR3(type, ne_a, v) + ",split";
+    }
+
+    test_glu_split(ggml_glu_op op,
+            ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {128, 2, 2, 2},
+            int v = 0)
+        : op(op), type(type), ne_a(ne_a), v(v) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a;
+        ggml_tensor * b;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            ggml_set_name(a, "a");
+
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+            ggml_set_name(a, "view_of_a");
+
+            b = ggml_new_tensor(ctx, type, 4, ne.data());
+            ggml_set_name(b, "b");
+
+            b = ggml_view_4d(ctx, b, ne_a[0], ne_a[1], ne_a[2], ne_a[3], b->nb[1], b->nb[2], b->nb[3], 0);
+            ggml_set_name(b, "view_of_b");
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            ggml_set_name(a, "a");
+
+            b = ggml_new_tensor(ctx, type, 4, ne_a.data());
+            ggml_set_name(b, "b");
+        }
+
+        ggml_tensor * out = ggml_glu_split(ctx, a, b, op);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            // test extended range of values to check for NaNs in GELU
+            init_tensor_uniform(t, -150.f, 150.f);
+        }
+    }
+};
+
 // GGML_OP_GET_ROWS
 struct test_get_rows : public test_case {
     const ggml_type type;
@@ -4094,6 +4195,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
 
+    // glu ops
+    for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
+        for (int v : {0, 1}) {
+            for (int op = 0; op < GGML_GLU_OP_COUNT; op++) {
+                for (bool swapped : {false, true}) {
+                    test_cases.emplace_back(new test_glu((ggml_glu_op) op, type, { 128, 2, 2, 2 }, v, swapped));
+                    test_cases.emplace_back(new test_glu((ggml_glu_op) op, type, { 5, 7, 11, 13 }, v, swapped));
+                }
+
+                test_cases.emplace_back(new test_glu_split((ggml_glu_op) op, type, { 128, 2, 2, 2 }, v));
+                test_cases.emplace_back(new test_glu_split((ggml_glu_op) op, type, { 5, 7, 11, 13 }, v));
+            }
+        }
+    }
+
     test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false));
     for (ggml_type type : all_types) {
         for (int b : {1, 7}) {
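
Usage note for the GLU operators introduced above: the fused forms (ggml_swiglu, ggml_geglu, ggml_reglu) take one interleaved tensor and halve ne[0]; the *_split forms take separate gate and up tensors of equal shape; the *_swapped forms flip which half acts as the gate (mode 1 in glu_main.comp). The following is a minimal sketch only — the shapes, context size, and the CPU-backend compute call are illustrative assumptions, not anything defined by these patches:

/*
 * Minimal sketch, assuming the GLU API from the patch above and the CPU
 * backend headers. Tensor data is left uninitialized for brevity.
 */
#include "ggml.h"
#include "ggml-cpu.h" // assumed location of ggml_graph_compute_with_ctx

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Fused form: gate and value interleaved in one tensor; ggml_glu_impl
    // halves ne[0] (here 128 -> 64), so ne[0] must be even.
    struct ggml_tensor * cur   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 4);
    struct ggml_tensor * fused = ggml_swiglu(ctx, cur);

    // Split form: separate gate/up tensors of identical shape, matching the
    // LLM_FFN_PAR paths in build_ffn/build_moe_ffn after this series.
    struct ggml_tensor * gate  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);
    struct ggml_tensor * up    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 4);
    struct ggml_tensor * split = ggml_swiglu_split(ctx, gate, up);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, fused);
    ggml_build_forward_expand(gf, split);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

    ggml_free(ctx);
    return 0;
}

Fusing the gate and activation into a single op is what lets build_ffn drop the two ggml_cont copies flagged by the TODO removed in the llama-graph.cpp hunks above.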

From a5d1fb6212298db1be1639db4c03adb2c522ee13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 29 Jun 2025 14:38:10 +0200
Subject: [PATCH 32/54] ggml : fix unmerged GGML_FPxx_TO_FPxx refactoring
 (#14443)

---
 ggml/src/ggml-cpu/vec.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h
index ebd4b7561..d5507d756 100644
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -913,8 +913,8 @@ inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x,
 
 inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v * GGML_FP16_TO_FP32(g[i]) : 0.f);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
     }
 }
 
@@ -927,9 +927,9 @@ inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, c
         } else if (x[i] >= 10.0f) {
             y[i] = x[i] * g[i];
         } else {
-            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
             memcpy(&t, &fp16, sizeof(uint16_t));
-            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
+            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
         }
     }
 }
@@ -944,8 +944,8 @@ inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, c
 inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     const uint16_t * i16 = (const uint16_t *) x;
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(g[i]);
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
+        float v = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
     }
 }
 
@@ -953,9 +953,9 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
 
 inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
     for (int i = 0; i < n; ++i) {
-        float v = GGML_FP16_TO_FP32(x[i]);
-        float w = GGML_FP16_TO_FP32(g[i]);
-        y[i] = GGML_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
+        float v = GGML_CPU_FP16_TO_FP32(x[i]);
+        float w = GGML_CPU_FP16_TO_FP32(g[i]);
+        y[i] = GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
     }
 }

From f47c1d7106e49062279bcc57fc1077c0db61e278 Mon Sep 17 00:00:00 2001
From: Akarshan Biswas
Date: Sun, 29 Jun 2025 21:07:58 +0530
Subject: [PATCH 33/54] SYCL: disable faulty fp16 exp kernel (#14395)

* SYCL: disable faulty fp16 CPU exponent for now

* Revert "SYCL: disable faulty fp16 CPU exponent for now"

This reverts commit ed0aab1ec31b4eb4b0f275dd7acd41d96a375202.
* SYCL: disable faulty fp16 CPU exponent for now

* Fix logic of disabling exponent kernel
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index ae5e06257..4ecca4165 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4215,7 +4215,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_GELU_QUICK:
                 case GGML_UNARY_OP_GELU_ERF:
                 case GGML_UNARY_OP_TANH:
-                case GGML_UNARY_OP_EXP:
                 case GGML_UNARY_OP_SGN:
                 case GGML_UNARY_OP_ABS:
                 case GGML_UNARY_OP_ELU:
@@ -4224,6 +4223,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
 #else
                     return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
 #endif
+                case GGML_UNARY_OP_EXP:
+                    // Disable FP16 until we find out the root cause of failing fp16 sycl::exp
+                    return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type) && op->src[0]->type == GGML_TYPE_F32;
                 default:
                     return false;
             }

From 83790b0e7e09ab17238b16452a33053a71dbdfad Mon Sep 17 00:00:00 2001
From: Renat
Date: Sun, 29 Jun 2025 19:29:57 +0200
Subject: [PATCH 34/54] server : fix appearance of the chats list context menu
 for Safari (#14322)

---
 tools/server/public/index.html.gz             | Bin 1913886 -> 1913892 bytes
 tools/server/webui/src/components/Sidebar.tsx |   2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 0fb01665ae5ccf999a186e13ea09b8f67de3fa1e..53b71079c1e2a3ffebc5fbc70d6b30ca3de8f13c 100644
GIT binary patch
delta 1705676
[1.7 MB of base85-encoded binary delta for the regenerated index.html.gz elided; the reviewable source change is the one-line Sidebar.tsx edit listed in the diffstat above]
z<-sOozCSE2|6Sh*&~T!#jR%&1C0j&nHGo< zH*zF7h;S!Ud~@Z~ZDiz=2k(TWEK0@@XRl1$GTI@Z4)MwHI|zowx{de1ib8)4Lap86 zcv?iC8m*Y=jpTzQ-DKyGm@!r^nsTC=6et>r$Ky^Cyii(WiJ(Z5)K#YH`ALEzHzagb zE4qf#8K(Fm`bfMt zMXB!KK28c}^ie7V3y~{?l|tETJZc|J@B&bc z_DnlbV@?IZ#neNNL@0-pDWy?pAVTt{_MVW>I3w6779X-=ky0^`TdjXCDLVFeIaDb3 zLIsHg34}y#+)@LFJ!qqrqfA^Ac|Pj8i(Pe659K>=^8I=W0P2ysd~{}EmC|MF=v^E3 z_uXsiC3oj5G=dJBEe$ntxaHxPc;AY_ZkK8+jx7nW zlz=86khqC#FoA_I`Lx$Nr=?R2hR|9}wCc+pIHAm`kf%V3E+Yy(oRWQ_p$)VCspcL2 z?HGWU7AT7V3`t*WEl$y;FU}AnRXnRr8gVVdgyiS7;P#I|w84L5Yq_)xleM{p&HNMe zB`Pi9bk&pZQdhkV&G7`k7TLT^voQxW31|rV_q&fbiPcZO6IzSTt^q-BGf~hpP<0J3 zAEiK%pd7C)$=1h7|#5 zB0=4Gx&(w|+9iL%WCw;xanXb6Ut&%c70+WwXvTxcRCMska!zTc;LM3jfDCYN)m`=j zVR_4SJuHQaFih=bEJVwxAJ8zCK5BldA`2qFZib63v~{arT6**JX|_;-%bR~P{<4LG1s{C+6d)jP4j%9y zZulo8;Q%gLako;bM7I#D+*?8mCwSiGytT+Vtt-SjcbVmkIm(8EzzPh1V1|uP0J~WW zf`B}7uwY3bAx^SN1rTaBE`f#{jI;R0!*4$#Dc3{f$r9h8$kC(erjH8BY4Rw1^AH#*f*O*zQLcLk1PI+AJt z<4{;ZLu*tw<9hYz7~WE(&}og?ba>}4IouaT!&KjY8djGnPE}TpqQ$q0gycaKkB{IY z{#S672XK|!0ocVJ_>A_z6A zdy3e=9DSP4^BD6}@Z`uo%L$MQdl4)ZLbx6g>R4XW^F2 z@>I-?&-qMv6?KSS0ugnWyuzNpw7imA-Ey|Skyiy-Ud&b*7PbcmH{*N3nt8%zIYm#o ztKBSUgFqJ)4DO}c-2Fo?v++Xv(-e+msaAhin)t9*7ld~SKUm7mEW&#J8-Nv~>Ge0R z07dLUfrE^P1!w>@!rF=z9l@%d>=+m%YJj7Rvo52~NKn+wxcr!skI3EQCEMXFQv@x# zao{daO%KYDvvB{GGB zpvzW@Pl>s(3{1*&g8A`U1E28-BeuLGUM7~}VY@M9*cPpnF;P4TX<6co2UHJSHy$q3 z@mmLd92v)ZGhN5I#EsBX-$J(iI@hJV;`rz>{jzNtd<#65ikbK^0J7@G2shcODwuFqSxML1(AvOf<+?!b{OI2?bVJQ^ry zI;@*m<#LSwXl04uFmvPD8$gIY07AsY%F_MX8-P9CUjk~9r^dxIEs>J^*#5PwuriPZ z2(S$Bb$}-`H!n{>8`nx8?mSeaH|D>`{yHg&zSNbQe&tx5VYLY)iPlx&k@^zC=GK8k_f4i)kfj+xrE1auiR9+q4425FSFR#sLj4*_q6H5R%zJn=3KvA{ldPgWiRt=PZ6K4rn5tNri)S@}Qz z%N#h}J4~bE3U>~cn~N}6ZogO@)Cker{!-1n=X(g zIqwk>oMZr0RAc=Bf=MP)3+;JL=gvXBpq_|!u?LEb0U15>2^B;oX`Js5gFSR^LzF~u zoUzL3O~>27EHPh2<30qP-7|z|;oz+2cYgM5AYABVK=7@NpDbvH(vqw2pT} z5;^uTUt$hc(*sI6M=dx=!3m@&b}C~oh=nn!=4dn{*_2ds%(cO&lE-d|sKB62Q=rrw z5USy;CX6f~9Tb?HxRnzww-Xc?V1>1cpx_8iT4S1}_6>HvXfj@C$Z20g*0fmCFT)(54F99k!Wk zTHCM>7f1oXMUv&Bb0zx?H4yHX&Yu>eX7ZCMeCV6cg9GxP0?FcVqZs;Jnu%Y%6Y5?H zP^=ilo~Ide?gQ}#>O)N=QH=iOHwH1Y~=uFOxMBP;KQ5(KLE+vNaU%d*&KmK2BV^Q#l}0O1DzF(|fT z0Q({&tr|BNaWN=xUvv1#{??1N-Rkr0-L<+04(C)(0~XQkZUpD7d?B`1$8@Wx{-$6Q+Hn#8%D?M zvaxvStlK=(U9)hfG&Ed?QEaO39Joey4gA!ot_%5OA1FE zhzUmz80c`~BMi%cM@T*_U$W(jL>Lr>1gMa$#uk5m*E@*w5Fd~;mw3;E(|jMAkbcE= z|ExM3A>jY8(l2yt-!c+??pga(J&#wN(0R9Yx+ZQ~R!uF*e19j=zx>%n@%CR`c%Sm> zB=Y=QIl<0o~3^s2he2H)`>oy7aR;dT3P)a z9XsicZQI%3IM-)i>@n8Wtoi{}W6m`nRF7MIBK;SbS(%8e1GK7!{$IY1jRFYP*jwBB zt2@SC$=g9!5oWbSktplIP?1$YB*!mTHDGVIRV4_P)JpMrE%=&;ahWgyNu$Ocyhs}M zloFNDM!oM>t8x|h#XwDnIui1W;~j{2@DH_No^UASNn*3l(|plIaSjK>rj5j)oZ$?t z-^#A)UT_Cd{Fe=uRVPXm6(XvLbCIM2=tEHW{@Q{g&)p8;({fXi4`w9-s=0&UsenIj znuEBO$Lq+={>Ox=&(RoWo+bSUYmFabDTc0Y3K4Il*wZ9RMlVYK9=2uVI#nt0BvaVE zVFFEYj*TU_E+r8bR(sbE53gqNX6=eBez+iX=ITdt7$xdAZEyKhbSWB4{sZ@N>6<-! 
z0iwzC%K;)=-uhN2+mJ)kdZ*5*Goa8)rKMStn~s+-uh3gJ^zIdft68;$7`e|TKPBOJ z5N8H{JO@PJ)HFOl7;&oKUOHjENL2adtxCQp|MRj|spe9FkZ1w(Ods1fq2|4jy`9Th0M(D$-dHlV4K@_2rV z8yFU3J4O6n%tVOo6Px=Fr+l)de=Dah6P`W6G;cNDFrkTRvo;tzE)( zZ*$2V&Wva{B;H)}t|48=4JeC}p-oA?F@}?cBOdBVZ`S1gp!JPu!@5vZO!9&^;@j#8 zzo!H~9#}+;u0|EjBX+KO+g?Kc#9X1UVuIb-s*YvCd=hU>T!E1G#O&GrP0-DI2jQU% zquSZ|ERTZ@21e5p82TV!JijS&6h7H{$!9JdFGB0}x52oH3D5j#0boF|PhWuCaYI|7 z@)wx>xe=`TckW+%M_RhTbf{|9!_zLOJ=n24m#Dm$?uQtf_y;}>m;|AAuy6^eC|`AR z10~hup;^&*E%gFlmpPsgy5^6>x*;B>oj3Rn{DJho;ybffxPs^U$yHFKSE8i=66fzn zW~kR~w|9R>4CMn$fHbd>lG~Z-YG%kTz+Srk2XB!sl3`}}ey0|C@3Jg}H=_lC!|O)1 z7q5noQ}*yL)_pz`KB@do=s4%S$Qc=lgC<4xIVV==bw>S8Y;{*h?xA9wYe(e4!kKP@ zxhYf5=z@LDL@)`AudQ$CE1p%Dor)j=3d7zm-(XAXy$t&W;9OT3jF0;V`oMn$o1hYv z*t}k(A4c*cp*Q{0T!-=VmylbEd9yO#3K z3DgupGj98dau7w@-tIAlRvOj)S+t-+&su0%E(&)mZuSL`KcxDRV!JwH=M~B zS>?F=vE%j)VMPA66$l+>C7RAog^}oVt~Ke2hczx`J6NNvhop3Tc`u%=@-UhC@c@#8 zzMWp7pSvv1l}}~t$O2zMa{~k2P)9BUESvBcFa#2jST)cY?VA;>3*WB&Tu^Mu^d&!0 zxTQ|a0QC};Jy?>ZHKtM!=V?C0@vTH+vrH9s?hqfylon2mZN|MQ;$v}$I_5UY}hK?V4D&G7^xR;hU5 zotXN?{-eYVF&e<mtZ<~Aj;M=t7D|aU7UFnO z08RGKw4Zff9IKk4;pr9jetS3K;t#kRuN5%_#SwiN+vNn@`~>UVnfaX&pS=<)9#qVv z9V#B{2D$BBSr4YG9p*y8=hYQH*sn~hLqMNN(3zj}5jO)Z0UrNjGGrFq!KS}>(}B{W zy}s6@k5~fUq2=F^B8oE2pzmFAcvr4Odc{ZUfn00UDi!4f-jn21ZxUae<`*>d-0)~a zu`n2O{_?+`_o!!)=1v$G3TN8IEY{$>5A;iYCDIEk!|EN=@$HIcgGuY^;!i{lQ~(Lt z-se80vpEhPY7}*-oQEFwkao*`)6787=<9$8W{QwT`AU>}`AYZMQdWgx&v(k%Qdi3? zdJ~JD?8FM%GR-KO3_fv}#5B#!;Y^j<=}@SbzpUGtUkUF>L!}~1MunmR-Pg5huF2JR zT$LKt>A{1+Vh)(<3XT~!%Yb1%0B|?_5w5TxQD3IeDT5KfJwn_piDX5pE2~Vd%tNA= z!5k+A?*&z)Ww(t}V&gFhZEBf^#CEw|Ub(R7>@?+&g3+NX3}91fn>z2Xwdlkx(@SYN z^=A;fWQDfOn8$1|QPW;T=9h2HZ+Vg9Q7N zTAfN>12NOUIXWSy;*@I|3M>Yf%pL;4DZtb6kv7yh+JarUH=yCbZEeRX{}d%w>KSql zK@WDAS$R2$;>V!mi$qgbtAo!%jzID5YvfY4YKM|# zR29xUDsNUkv{WsBy{cCmOFLNh*bCGm(z_U-@TlA!_TxU!tcrq)1u#q=2En_X+e`yB z3Gu0-jh#1w?2U;TcSTItWWZfVB9{XG(JamO5^&_^>rry&Eq1g5Wu*tdbRp| zZ1^}&(c=ze{(UsY2qaFL3?308=Wp+J^A_2KeP>i{Z9d*cMHaxLs4QWH^Ud&O$^B!m zPDL8K`H_B%6aC8MoR;?=3sP6@9~`Qog0ESYu6c@#5u!7v_brZ&w^=ZV4Up#P+5}p`&pmn(CvxQJE zsNl!li>QsbDadH|MA)>3*kp{#;LS+7__Z@E-dP@dds3jD*B8hmyF@Zie-mEx2Q4@1 zN=|EM$@Dzk5TK8R`lN^$OVcH6-*Hg{^cQ42e!7X|E6!O z_ULJ<_6SgI%bH#OPY&W}Mg-e?|L@kfs6B_Fs({y#rpuA$y-`|?=WvJ)nb-E*LU^uhA=jp=GSm;Z7#i>X4Zo~ZF`c*0Hb5;xZ z{C1h1!X9-qXwFTF#Q_2UIt~}1hG;}TA9_TJkiTIWFkp5=*H#el`8mEZcr{3{;l2Zh z-lg*3{i@;BM-r{?^Q=+A(i5*0i0vQk(5HB?e*He&MjVsf4NFLGh!6;xdgpz(X(a2E zm{sb&{BqG$&;+XPL@bOsjfSv&1WLxbo4-h^S%O5O!|Bk*{VxyrMM}>3Mfjp4FEH;r zY}t)GVK`Z=Y9`42{6MqTtMC8v==&wPG)B#A4>jWNy0N~N-4>3V*F93J=VvVo)sWr9 zy62q@`JdD3Yj#vC$IV-5GFDD$i;@Fr2Cc-own!r;sNGmr&P<(=O`%$nkC?PW)}=NeZ? 
zy&!Kx1?vckwkI|l(Q7#wdsBn_7uNNMNB6s(Jgi4T`W4UfPYm+24T^Ki!KjnpdK%kr z3ZJad^m;p+&(|j!Ih3#7N$V$r@2pyWqQG4wB&8&2hMuPZvwHcfyqgQDf#hxFQvF#$ z4c*VZd(SIivICL?baWgk^ydbSQa8EIv>(UV!|t`qF9I*t(YV}aGv9Kq<;mOg@wR9L za`^WqY)NwyWzB{&@oDvB;4`d{g5cTTk)onp?hBxzG@Aovr=?THS}- z%j>WN`?l8*Z#_LPU*J?l2Fj}$7UCfN8P+%iRap-RY zoVmhNSF_dq88~#ubc7XlQNk*}GR)4sF--UuLMrXAu(bR}!1QrWoI?!SD!D+;MLQPmZQWWeD(hW#^l!KrzOBew30Zt`AE20_5D4u&Ki~+V-0xtM!25a+dlb3uoDpM(*teO= z4Xx68#o?#uK2dE1$gZp-sYztzF8F+KC}~8bFV59=M-%lV&4&CI&Yb`1@A9*odIsvDDXh+5J1q$^UOiQdBi+GZ@&f^gRyGzl-_*N z9np@d4b$(zN~kA0LWHRMhw9nb^^WoPs+b)j&0oCF!5dV5 zsuiLCm9WBUh4a%*&X!DNA_*ENk7*SVEB61%;wZ&fi_C0IwiR1A+b=$j{1>=?O8lwf z*jh+f11p62e7LBz`L1SV&OJ(_gDl z)hv;{Zd~QEYOi$xyLZ&&vVMx@0mywO5j9i}f@q0Ui z)L%d&LrgP3qo%CbkH6QvLHwOk?#|hQ>JkIVPs(aTm+?Vkf1|YZ4Eg|uyh=dRWR9a0 zsFs~$kcU?>2r$PQG$rqsDcEIhn790)q_IcUm9DZQv2q&hQDHj*~#tlCHW_`Z2Qz1BS`_5Mi zH2P7D5JAbZ&fq-Oz!iiV^9suiY7y`+5cK#*(dE~LW$m+SdL9uu&+NU=OtHL{#&%V8 zb@i4vlZ_WKf!iiNUACb|#N|>U51M=v+|eBriK{dqyP!5q%OL%4@;V(k0WHzonR3bM ztmidOH0ZYv@x4~JiSrQDGiOpyEGsGuJ(e+pHxG5;z5YtsjGV-E z3fGypeH$bRMYfE{peGXe$l(`zNQ?#yZZf`6=URl31Iy|5G)J@2+@7J2LB^kE zVCadS#~a3G9Z*kTj+a>MT{AN7H0+>OQxCKhU# zZr1xcu}m@1jPf1P$QQkolfWHx#qRzLNNL7X%=FfsTUw~^$I=my+8tp@ElCb9QlClcw?p7y0|yC4fHv!}P{ z>5<&!)X}%2x0$!S(YLPU)y`{R`e@G~`W(Nx=hwjbvf*}3pF!x>;WV8PO#JH+N9%cZ zw(?B(n(j8lYsi)Eb_t^RVkup1O_5)km)>-}Xh(5nt(I+_-L!3=zTrA<9rPc8t?r+8 zw)S@Rz5o7N(bx8BQ|X`F2hLU1&MaKfGl}GhGg*ZLhrohmR|I)ifDm zT!vO}=_Avrhyd?TzRB;__FE`dGFwEtzq*EvABZ21HqrR&-S0E!TpJOICLe%iDQN3U z@bRC@Q(S~`rWQ~qkk4%}iPD3i0uE|S+#bh2eA|xhsnP}Y5U)fP)~;_S1~`nEjFjXt z{M#^3v0}u>pYFQWydvk7ZHZBL004a#yb+GT+7Cxxe5j_~oo&vuPU~N?{pkD&+8%EVZT40+)%{ zY=--$g7RpbnxUmu@{XEYHmGM)dzeTB=!;i1RY$f1v_8QB_r+f}6H$$nzeB{LNEP+DzHpNG=Wi@2S;~GT zA$xc#Of9oJB()z7Q5e0VcZ7}BX}V{b?due#CiL2};A}qs%yXrHz^i2Jj#@U>bhes& zKr`4Ocr*Yo>TNC=#&7u3wZH+{-q44rEH2XJyInK^c4ud5XB7X;fdkTbgh2=U8N`&O zG+`@4FM`M%tTC-q;VZ~$bSM0mbKe#$wSahofS;J-13y1!Uj}F52*Q=UexpAKGG>zh zNDtH4*44k!eVJ^*Z^M(O5kkY8NVi&pcrBRmq0$A2$uS6!Bd<_ZOcGD5x#=Prh@i;BOU)?1!8jA#m?MW(QnH}XdS$QOMApC z+=z(BAcH1q9yi@0|Ge@wqFXGmn-pHRip`Ksi(N^P*Hr)s9w;BJl1y)_30Ny&OY;eb zggplgn~>z97)jjufb&fkh7;=pSc4Q8SObM92Yxy|G3k@wL%H&2g^40C0p9Nd%c zPeWX34G4PcLDLTABfHF8p*U+*`OcbQWAJ~l&&d@&&>m#+Jt@N)H`}q(^GE0)QiST5 zFOvl=bwi(_9-G#VrT0$DPYjy+7Y<*`2Zf)-gcMKboyW5a;iU7NZ%J>gmOES$nJ%K%15)`?w62tt#0Y0F=e=g z-{G2VMBytq!Icdrehe$N4>T|}$EpDH1c(G(9({U$%gKJ)p{2diF`Fpd5GnoVHj=;k z7&dGM7fwzH>qvn)R8>wVf$J^jZiaSkUnVzlQ2*XHx31ajdSh)fyx7d8!{;f&@nk6( ztlxobYW3vq<6@vy{$dwzRieH`5}M&NOuK@H%_oQ>6Cq5yQr`IREMxt956uTmz6(<5 zu4L43-zaB1yqfaKgb=^dN>7Teesd(lDOG>@o_xFLS zog-}USIctsD=6^#*TVn0ZL0&OtNST{wn+0o4H|sA@qkd9hs1;yx9qe$BHy5?j=D@I z`unO^XwS}n$iZqQebsE-KG|lIQUeZS!X)B+opO68UrvmTy_X*Z@k?CsfQ8hMuc6-s zW-T%b?E$qdUA%02@fWMUx{0EP*vF-QsB20k=S|$|ZOC7{k7^1R9C^T~mtLZ65ZJ}3 zJ4i{*T(urEdyEKNmDcSOfmd&^iX^+uaRx) zCp$|L{UaY@A5z2qPYU0Bsir;2Nhg_Oq%Zz9Rx@Vyly*V#F~z=Dxj~OF*=4>(^Q!}*I|Ty zhxlSf(Q4Mh)R)04%~tlqFI0PfV>d6FI>eYVNV8@nTAZ($U6ha9_Huwlf+D+%eZ=Dk zEl{R5Gqy)bDel}SKQ3PF-GaV^N*?jI89%DNcz5l=%f}8hPQEMgtEVJ^di6kSGz(h# zW$9z$VhN7uA&@|h6X{DFFwTX^VK`Y z_|LkN)TjTCvn%g&9Wq>RQU4LDZcMookKetF$I(Ocb>EocUzq}jhOUzQF{dKQ7yID*{o_r z>b{v27C`%jeEZM9Hd6XJoJGTnpuz=paFrd^FkbQQ&_z5?pk$IL{s8Oiko?))OE=;a zWcwM-U51Dc%&rk5AnSLt=nH~`+bEq~S({(PXBYE0qgc!u=l#XRn~hv;t*L99nf+iW z!5ti*;SgSX)}lMtu|1xr52&T*Fr88bB@#N)p0_;e94|HTv8;t}0be*-omo zZe2iBR>4(eKeZ~I{1E!(ZnH9|n9<cJNJE(%44Ei;e>0^&$dVu&iqb!RD zyv)NH$aTPPT93cVejV9R#Z?4Onz-q*YfwP3(@tH;4|R&#Dl>G0ej`!rOMbQo+WF?*Zb3?6iM$w#NzMn! 
z-|3P@b_iAGn7c@>HcRCo7K*jc48N%OFk;y57$`2CxZ;;4$+*49i5MTel^){+#l^94 zJ>HqC#BCwThEV9$J$(C?4b`4>UTZaOram%=0; z5>6u2pXQKr?V`r4=q7uHxI$kjdTRD%FKrsEM}d$Sx-L%CHAn6`&Fggfubvsa=m2!s z2kLS*4HI8^IaE>>(B#(YRi(P42Qh_Ny~2RP?{gS4upMhUI0Qr>RqvGHMk%g!P z+76|YS^z98r}!Zt{bgzWML5ZnlgS|3nU5x@d}-wtEwIw(bX@f#3+6}Z!Uatmf;*Lf z9^>bS9%B?3=puRa2- zBX40U)1mW|yuUcFs-FmH(2|2R4pMbU{Y#~yu_(svn3o&}O`H?d0dr_ISFQT? z^%&xrbcNHOky&0T89F?{dZ4}94*i5_`D}nzCkwq%o5>5uiYIp8ij{DCo8QRlgmuPtbcI3}^}$Kcg_PUOH0`n-kp&$T6Ypc7~uKp8iWAWG&q zF>HIkzlFK*8(+64@9dEVO!I9KX7uwu zfIUN-Vs<-`t{908@N6hmj+AoJm~5h%7dA%0xbAYMIkX8&>)tEb`IVEi6gPc~T@~~s zy>0|YKFg)a&@uUyc@2{VxaP|y|{5W~HbJqlLeu=#*C4Z!_+!{Sf?>aXjwCQoXJi9P;$=?{<* z;tKc_Z#0#g@-09 zZmjMrSbn|%4h9oYzbd?xa98QpQJ`2#3KyP{?<-5ai+;P!jvHqr{yW=h8+z&;-(nX8 zXBuZEhlzooGbn1?7@Hw)9<_l-As;AYiCquEmqtYzm?Pjq;lwLk2BTOFFj^_W1DBq)X&Pjx)5up`D(LlXxP=eH~X=|6iz3&xt{e{B#O zx&TFdwJKm{#CIs@=CVUpQ1WUD^^OmKZC4Qf>Gs_m54QP$A6$Y*-)(x*Zm5|$0<94n z%r!j)FtT_AEwq^F2T)a4f<>SVj=1w*SLS7$)_^r=oW4qOby@R|qA860X+x36ynP5@ zPkY9)eg?`~a(lzKRAO;i_G0yXEEXMVi{OeJDmuMJs64s7=IBb+!*yg+2HcsxW$k}~ zdTq}~G*U~(Oj#3{+-aLB;Ueu55e*df=7-z>Kc@zghYbo5uud!g8k$ef@6NsPgp2<; zI`c4Y-5nY(?CN&;U~c3H&uv@1rCKTe8Op@dM>2NZX9U&0Oz}EB(l%G&32(^_e;pVX zGS6$iH0JjK7O%#Wg^gMv4s}3=tF$K}6w{z&j72!a>h*|T(>f<_=>O_RIN_y;Y>4j& za4zs`uSxX0rY0{3IL9VLLZ?=l(8!1u(;`jP(*9DPruw&3UfHQ0v>y_cA9ok&uf9^y zv7*|NNNOCbpBn?T6L0G@nUV_NF7-!c3hhS7Z&ZfsOOg zOM*gyC#2njmd}`-VO&6shxoyXVvbJ=$Rsyl1{lg{NL<_H?T>=hka|hki9U(S6;hh2B@1vJ~?&^rKSjm5~>${Ozm4;F#cF$A6#L zL@z_C2P=_CTcoAU=-Gr6aThoTsjl8$Y6`_O9b8pgxqvuFb9?_jSp$_RXkQ*F!>|_-)$n3ukK|V?2G2U7I8i( zjy8s)iT6*`ewgPp-ewMIarl^rCLXWduRjUYD1(zM+j1%fRynl1TObbOdkh-iQ)6!k z>bLA6a0!+nCrNqDa8GA)Bant{03CAClk1}4^Q-bzU$F&ow5x84z0`7^TYyQDl7eSe{1ao+(6M!}t8r+oepvjQ&$G#J&RP2ca5uD(MSn3a z;{>jWJ}yh@k~&diU<7@Z-S6}^EPeTfiC7*zu3Zp3h8>N~;J%i=7Q{##1)QL9y{7xr zcMEM~eRjNClp9q9p~f3wsh28DOC!T|xwa10$40X^iu6qn9els^T2B~Q>K}f!iJ=Sy zrUyWmLr$y69(@1Yh|2blQE*K3crKB6T1v~VUtdXDPD}Y zCd&t}=gQMI>iUoHJP_R-7`8rV`x0{UyWS5W`(?#!J`1HLw@Om9(2qA?)JKM(|L})I z`aRp44v%c82+w3~)(i9(79)q$2N-;fx5b6D1!xQtK-lvO3Oa58@z|;0xZ$Ne=iX?K ziyyI#a1PZPDmqX2OITY8%ds;$16^rV>KTFSK_2YQ35`8IZgs7< z3`s6Hq+{jMsgTz7b1#a|o8sCW7jV z8m;HLFHP!7I}7*3kAtlCHLbGeOR3S#Rye2=oDo?GhaO&C8}zbzlpO_g?qi0<9e*YV z?+|y*Zyf`Pu4!n7H2k{vf`p|F46o87`=`o0O?)25|L)7*T_EX=ByK;SM;6PKj-xFf z_i(??5~R;jWTTHq>^}$mKsGizqRH~61q?xgel~g#SHU^uKS-_T)`$qPH^Jw)Kh5gj zVr*@0uBTrR`IlMn2MkJL1W>8(K#snA7yMzTwg?4mu@a2IP12NIz&O{PIfRAKd5mpc zoMQgKnSwPjLyI2p#TdnU(^g4yNZ}X{(Un1nHhG)L%qy=5(9|RFLMdfu!-(3XlGM`=ls-wooOk z%m4$Z?eSljb9ORXERB<#S|P9{ld96J??)E88|c|!AIv3=50QU~!5Dlo{~L27C@rB9 z&JNiTActhOcPdfN82&Ve={w^pqc{vt#^MCM-xGllBc!@~GeXzhz|03QxewP`%Fhb&WMPpT>091UJ-RHnB(1}6 z4>XV_{tVw)?y*)Oloq}5$w-pgMG76_oJM zA-^!pcerAJmxOcF6~HV}1K*jX zH;9?IYKZY@8Y0}zEV|iEaAKgJPA4niKlcg|ob{GWiDG0ot;lS7Aft~#-i}? zG{6rz*@t(xw8)0Q=9%}J>WU`iB1lOJnR(d}x-2)+$Ha2BmVUl>*Ta1cDNzZShDuSm z;R+M`)uX2}`bIWfDxw&U@(s;^Zw~%Xnd0Mi$kefjG16M(Ze1>}h^9f+neJ6$TeIFN zIh=DfYd_g7ghx^)N2kU`CvrUpr-c)^9fqOyK1aXSP{*8nUeC+WdSuGU(9%#OKys)? 
zij|4MDQ$0_av(hgRIW<1RC9sqol+CJUlsiubf!J^ z+>PnE%5I07&mw|3_7~XKhgjo0C~IQNgF3(A0!oDLYwSh;GciY~aOcI%=>X5p2eGL< z+rLsJ&#{ifFcWP0X~lYE2Ua`ddMEJ#BiCD*`Rh{LPM-t7Gd772Rsz6({H50qjWBpZ zFg$BH`oWLTVzt4{DR^Z&E5oG$+HFm75 za4g9XuCSWOSD6CSVn(>vi6p3jP7nrR zx@r@5ZFug=vK7E?^uheEK}Ksfw{k7^RqDy4glH==Q`kL;+~H*~fOifa+ZtyFZ38_3 zYfgM;f4}}e_->xkiKopR3l70mag`~Qk0Fk3Cc)F(bguy8i3q3hmz4h}WJ=S`+|Dd|~;!O9)W%eO1jDi#u}V z_m#tmMVce$U5e38l75+MnHCj;Y2>+#DJ~`IC^{cy|BN-LAwo7$rKa1qR>$FFCmFqF z%7zOH`w4ckq|GdpnFX(`N@Cj`E6E>I<%YOnQy1}@rcy!+nO^EerkkXJ=Rp_;T$1pF zGpw_nN2h5u^^kSnH_PGnic!jI@hwP¨resT1j)Y4|D)m{)bmMjTPGJXw|vv&(wS^0si~% z;sqikldZ*>GBQ7S-JTn;YJF6=_o)t3 z67Z420|?pLqD{5x+7UsOl+5iU1QyS)A76lrEhrvY$Qt@f1IrQIhP=XH`=RCY>!%-| zta}jTYK3mn^g*el3;Iq)N8kI`>le6zuJ<_(@1e4rspZ5kBGGuL#js>a43IVf1f)jy zNXn|Ze)iUx$Wn$nWd%8rm-H4iBE<(jn`pBZeF36VkAhGnFjEtt<4FvcP4D;1uem@@7{KI=%% zD>+@87@oh4v}K%e9fBWK2k(JK$An+PWi{ZPUPm+vPK@Pt&vVP@*)ou`~rghnz{FTxEM2wnjoL3m!}^C(k9i#bJ?(QDmH&*ck$ZXv#V%P*5nW z2&SDEh`9t45fJjW2;bmSpE1Lm$=Qqhprd&kzf$7sFG=S0<4K+8wqQqB{0k5~vlKkL zQ2OwbCqI8u5|SP#`)3S+18xE3yd)fl_eTX>;FpmU;bfiZzf%8tsMza_7gpO!rOIK4 z=VsHexO;rcA~dPHbI>uM)$Gkg*q%bnIYsQ+h#%9k?7qOM8|JObVkrx;zgR8SSqSa* zL$6SOhTwCW8w_0@>9}x;Mg(=Uztf$GI)yEYKwhOGL_*_P`j78MJs%I~Ib7y#1rpA7 zb}1plvK_&UmN$J^0crPwwUs0?_Z5`S@__(|KUO}-`s;}eOFtB#?%~kznhBRin$nyt z;$6_=4?`kP@pXi63=Ea+MYFnjk6%vUWQKJ{c2Ajxp3~jJ7`Ei;ik$dT8 zrs<#A&JEEyUEdN1h7QSh;#j4PK&uR6vACVaH@$=MwI>6gsT~79-UXv9N=uT%nrtFU z#6zTLMnP4plxTav3bW6*D;V?s#AbA4Jw7s9*IB{8A1JKc*WF+WI7W{wEcl~V!5%1q zWgs;Gng*7uo%DT#C_8yb8wHbUwdZIWtXj!t=1vp7S`5|=jh`$#nj7z8=`u^zSJUU) z^5|{pr|{}V z4Fat?gO=3>sl|6WS7JD>d6k4LkgK~ffYb317JPU%dUl5tVXHl>EI?&gW%D3`iEWNE zzj#O}7^BB4$^=eHM@o-7&uqEq9#bLEOQ(B0$Aj6=4FiUMT!TKX9`x@lr zWB&;>yaVnk+CS6-ya_;xpr?ZXFh;mi?fREPms_f@fZjNvoaOHBkKq;(PPE3LK89Wc zDi!6GR-NRf^T0BWJ#@}L*{_&v^6z9?_-KW$)JG&Y7!{DwKaLWmha9ue#y^ebY}Il~ zb{IE)N~(s-=qr!zia=g@v#fxINDZa>hC#b!h5;1gsac~OqRg&y`H~}?<^)N{sG(5?iLDTzva5QyGNtCudj8mrHzPAcM_Gen4&FBj45TqEr;hDg`62AUQ!{S>}AYuv~2!@V9KOXK7q_edVq z4v)u0N}pueAFHKk=YkD{PD8fD*58@w#|EbEaQb)?d_ceVTrD-DUVOdwe>w-mU6Zju ze>P*H9Glv;%?t93yrs99<2rhdgjOzU`DW~|;Az%*?_)!Qb9)iGrp<&z?6$Y~*TD#k zIl7;DpB4FdMd9U1{}*36fBE}nVuKHt44SaF{z?A^sx0c6{q}@Rvv{Jg12_nmr`BaSsj!jF3MVmeo*LE3Y@cc|VLE z2OqcQY2LIo<+pj}$2?I9;Gt?BUPFPWQo_M2syT6~GmE({Zn2m=|7RRcb1}f%!x<$(l0U_M+7=>cd zI+{dT9P2nq5Y6;*HA z&P$`++MzhW8>X~b|J}Q+1fppVXuU19ypJ3G1BNS_6y5&rMNs-J_}?nzwx{&Rl^C3} z*nB8uT(Gz;hh{V6j^3cA>5|L$Jn`h|?*yU!^%+m>bTPrMtZ-B7t0eQK=hc;F<{mY? 
z484Q3CYn`pNj{V(rUh_k4Ba}*^`-?f2(+sD+@nOY6j z-w%#B&-9j}om0pp!h_17QZhZd$WA0_>8t2syNF15eR02d;@BpoXWGxclSslG?k0Ay zUW|Ba>B*xR|4?YUJzWs>>pLt;9NU{VoRk)79-`{84S!DF9X#VZshjXQsYCUsvb)L2HAYg;JQm`XVxE1GbK<*O26zryX6dv!A5^nJ`fV=dIfQ{E8{O=bBz-L9G= zKQcv3mlaKQUMFG%YS^ZEfnF9udD-^QUw>bF)M_c?A{R_3`-LjW8fdk!WcVKdJwU?0 z>OsbmToA7`3uTC^e}qgGgOe=_i%n=+tYSrFV6z)HxVyB5q-mWRS}@Au3hP;yI4VkQ zR#t!^?uB;-jd~%EMG2Z-pQf}pLw0Z;r4!3in2EU)(GplDR?>6v`I0j42x*zWdMmS) zx8?Gk4wj@UBunwB@f5xGkxm%e?i%!jJ2M2VJ;Tf5+lm9 zmdXVtX?Gc-u#hwP$jh8N(({%J^1Sy*rW0|$t6uF#{5oXOoPIf`GZ6m=uIq9UT2`X< zBS`PUF)Qm)>`!DmQ9Q;^!}^KHz84&)LCk0e5cp5hj z>E0>cgz(lfe|ObY-IYTzQG*qFqUMqs*(LD`Y=hf+OGx(k;Jj^6Tg+h|#&ViXPR)X@ zr%{fj5Sgnz>+u}*#GONSaAC;>Zfq!)ULr^pOC-;_Y1q^CgXVw4q-#azSUf6QZ^$JV zgL0EGn=z~3n`W-D25wm!X9cM=T-wt+#|o;B0j%Pzf9k8(ulzHUax+<$vKZT`_r#(m zT-W!c_wuq9*YfBdrj$$ffDwsm<*X3%hPsWwnxw!oJQm!L*=~XAs7WHcJ73r9G za&~ZW(rG=S`QHyZ_GnRVh=z>sV&&=c~U z$#iLnf6HJ}X3xN+erBFbPh>VbD}o~Cd3lIXpHNzQl2C4y2gEv7Y$x^1)R!nN(SR6- zcpZ1&;ysu-ml47;$({aG zf2=>TD{+GdFeO8h^ZuI_g>`3I7?z!Bp|3i#GGBD2g;BeXGLvjIofxO2z+bLp(#i#` zjpULSF8REA_IXJbh{AKf7Yd#^!~)OEM@@gDQDAPPqf%#zJmB3VII@= zyl_LEChDpzjd{pKjyY*1wmk7%`BE6gOOu^<_qkkD6DFdvar+!+6g6X>ur~?TWcw{$+eh^mqAlfudSxMBo~Juv2P?Z zrRIH})SS(dX85ebs-e_i$>$N&f5@S8hLUXkC$6t$*-|_m1eaG$o;rRYq2zBAknQ;- z6CN*Go$RBe?508UK~1N zRdMA_N}V#_XauZa2dX>9M)f@T{OXcNesQfIefA@{JQ{EEg zZ?#mMwMDOUa2dK%T4wGVX_3v)R+Jc`N+tD!PqJ7o7n@c^iw|Lz^ob*uoWRLSZIPkK zCkv?aE={Ja0~nwM3$9p!wLEre$x<}ZfkpG_0s+H>$hES?>XgZ@24W)MsUUe{L*{^8%uW#u6-fFbQ$x6U)l?4UGU_Czn9P?FmYYYdouY z!GsqXGx!yb%veDCdMNSnOmdL%5Z}~-FycK&ilYj5xFj@Jx!ih2xtK_pBZ|?*`BsK@ zne0$5hx+8`Pbh}@`eYxLqHu$dj+f>0Du&Wz#LRbOH>Bt$Gl%4Ce_`d>d?l(yQKFH2 z+V7+wD5W)*DT<^>d}PeePYMXWf&P)oRi*OsqnOwe(k_<9g7A#~X$GGL`RMx>gQL?r zy=3&FX_Y*U(TxI4%!f%g7KPieBvdhE(zz)qb;EsH6m9fLX@X~-ln&d%Wv3}Q49v8C z*ltq&B(5509(}8le+v$Xh(DnRZtM8i^Q!F$Um4NZ(_t{n`yiMXx*{PjwSiF!2*hI&Z(q%;c;L}=bLbPW0#V--nr zsgV_fl&XPzw7R0|80s9o9eu@vQb96N10h-KxAef_8?;`_e_qBX`5e|!pD!@04*Q|} zGXz@>vS7;&a|pHUengbz#Ycou0+t1i!tlpN6wYX-$? 
zU`joWP=Vhu<64IZ$+;&rTWm8PG9(<0bmjBYfVgFP?(vFscQiDjy^4c2yf!Uk5#Nz~Imj zh7YRu>^kN6|B)v?MGVdOgI4)u&_h*$I>#3`^e*6huVwDto>d^Ab(S8#IsEu}+>d;e z@=X>Dqb;te1I7GOnAczt#xKqTF-MPQ_=#O&J}n01e`I9G)D#o~=La22ADm%=L|lo@ zIz@Em>=&3e3feRHbM$8+H_+LPJD5;exac3&yJv-pF6yVQrd%shM|Uggg(r}}SVp$M zg&Z5k%@(~61KhiSVzk{XqP3&wwFqj^#~x7v%jqIZ7W6}ve7ApAa53fCcJqY45P1;R zK=qVuf9lts_=E%%upe6mCLPdx0^GwEjD_eB*dQiz$`*0d7l60|lSI+E;JAhj1u%#P zttQgc@L+?pI-wpN*Z}nQNYOv503-)mn9q($!jJL#i-b*$d#QxwdKf8GMrI?Te{KyQ9FVGNgw>wPK{;_`)Q$QHYTftMO3_U=$TWx9x~v4RBS!+%AL@l1RDn1q*K6 z{qj%D-8$z7=#+}f3JM9P%E+cxYdKC5Ws2!3%c@>nhd;X6DV+g8}+H13&F+l4BycnAvRh+9F;M-Y@R z4*`p|JM-VYD@McaPu3P7n(aRS?(W1s5JwOPnxWDYmxSySz_S2&e2zztui;e4VdLYR zj==Ir!lvtWW6A6ULfO%=%GF`We-?5KiW-e8ZU+XS%7|D$9`rN9WIK%QFzEb=xrj}9 zz~ClD!$ye4k2ZCXPJ|V$qctdtYq^@;=f|Na<;$}dXrZi{jZv z-RfUyB+D&&2d5@M$}k2v+h`0<_B;@1jvzDY4n<7Wb|rwlZOkt&_Bf)90|&j_{{kyO zvKx^6i8Z&cDxvC;coK)K$NMXAs5 zbQeTNL<*5+BLf@dK|%mxN_ox2i;KnsA5;hjR&h$pa~bd2D+$g9bCI@ts@AYx}V zp;23bcoXXdX(sQzveoZQlIqy>QA^C|fQe8hA!>(pY*NBbe4ywkiYld!MUqng(soBx z^@-BQa^@RUe=)-`Fgi_}LsOjmkYr4c=o-zi@E>?k0PaWUiI~y3Lh|7aNiNLzNiI1U zi&%N#lJQ{cm}$UTfU!;=MvDFA0RzdR36<_i1q_=FlcSuD5%YnB8e)_CE49V)!VQp} zA#0iUkk;``L>ERN`qoB{koI9iM>G5iajGnaQ(P}@e;W>eU|C2<7oemtcM+6`B{xuV z+MyOo9dM*QP*K$kr@{-JA*&@Ep#kmi#b-_f*S!ni{cSrt6Tjz7hW9c5D8fA(BuL;7JWN31XE`@bs|yE2aTl+G^D8y ziGrn}e^K1GBDi44M;5&?1ecNnp@k|2LwpFkSEMv*tIV>mDIZuZi>D^IcX9wrIHYPx zTVcSVNX`xYNisY>xKuqZ8Xlzb9#i3&;O08|^LtQHhgjtZeb&;xV8J=>s#cQ`r{Wr5 z!m0DSB-n8XpqAH=Wc1@ey#Rdv@q~5rv_`Qaf9*1XWMs6k!CDHsgpQCD!aQYA*UYDI zt=^|_sou3F@mU(Ju%TH6Tsuu@4HauJabBPiSHvnNWqj4q8o%*lGxt$&vzVu1@8Dz0 znS$LmrRoQXQ>B2es1Xo}B+(L!ZorO6##dWG`dn}3S%m*`gOLk^lBC2W3q=$+YNbq8 ze+fMer>BPrK4LKzThfi5f^u8Z!;F|W5&L`BaNpEICS2aKoV8ZK*5H;^v7DII zz_SxM0xH%~zEEt~Y#H)lWR$6kg0PA0f2pM|q{HKQiCgv5k&x6rI$@dyy;C&GAKNRC)& zVxC8}%q-^o3Lrhw;WA)3(!|y{90038*XkWLD;v9ah()(WJ8ftTS5hpzeC?Soer zXp2Ev^Nn0$rAr}OrTnE;NwWH=6p9l8*5ybS9?LSKOf&z#i9M-|W0Fl-XsMP*M7VGk zj~ixknZp0h#CMBXE=*AWtCLJRC*&itkC-~-v@^RnnctTU~bb-P1i+a z0OTZLLe&@)-EjeEIa=;bvK75qxCE4R;4e&6Exxn^>zs@OQ068cJCQh0Cs(RmNXEx& zOBl)tr0iD6%67~(e3~;opZKbWT{Om^16Y0^)DY*Of*=affFMN8eQ*x4f8LXC=APd< z2b`whvGyk4fV%?R2Sc1~DZ?5O_iYQrMxvh#5uP;d6{Fa z$BO)G^3BwjAin`|e2NqA)1|L-Ux59m*FEabEjbGL%G^p(?Cwu#)q4H8=e<5DACFE^ zwE~FecV}uvu@YZsuV0(*^`9Q~CH2SMFTMWP;^7}pk^7okM}FP7f7wAqSXm2@E+=F> ziCqbt46uAmr70Zdb` zK^y!#05RL*j*^?}D&13BQ=e0N%KuzN^U&DYyh2=$oz2a;&1C)zK!h5vqS&#Kk?dV^ z>ekuhIIK9JH=2TLe?~1Hg&pWY$vxzSwdS7V&aGO_%xviyhq;Ld-Z$D{U`Fw`Z`JIh znkGqn|GjiyMc};Xg6~WL$mW5Ji||x9R3YKlnVPLl^=nVRB<*n2%c@EGgh{Cgs7)^! 
z7dbyZ{&XCy6E)B(@CiEd1fB2%?v$eS?*2$HO+34NOK12zf3EJ%Ov6V6NWFkUMIpI& z@Ugxle$OlTnO`Uh(+=P8zTBbvV{Ph}4ZIzn7#t7+3r3GRL7ptncDntDHl#7 zL_Brs9e8S*f2b;Y^-M!+++g;I&31EJhW}{94@_(oO4c&iJU&*{He&5)o)maQ2Jjo( z*xn>&7rF=uRCgpm3nn8~aiUX1RV7?qBsS21>Y=YNWuxKYk)0M&3$SS0eQ8T3*qWwt zL|%2U@#bCg2Hr^A$29t z`Op(s90?H29HBS%;c4=aSRDJflo=$wAIM7Xr~zpf4!!Ton;P`_uJ* z1!n)DLe?et{Q(RzTuq7Wre=VAF0Y3O=%`FY_BWcqnE}_IttRRVP=8i9mMGA_JcN}6 z)XFRKf2l0t{L}rCU3q`%SAN1NZ=EfSA&Urh7vwyJ;7UH%_D5B0pF)+>>h8d0Ceg@p zm0(4tfK-jo{?sNfI3YgQ8#=LIC-Z6;n`rhf=zSvNWegHH0DD;*xYs}f86I!(lX0z(A;B}akb4pkcQ*h+2F}+ zd{YI!CUDvs6dJ1a6EON$3=3igwAnH;K;#Fjg4_@v_-_Xrx(aUh{>>NrM${2!*C0Dk z-Jgby##NzdZI=zzj%uY6J(l=wUoFeQ%7z%&?Y~R)vyEyP4mEp ze^8E!8d`CskquXf#cT{Za0n+A!#12#e!w|T@3|G59N0@ z7W^c?OY*zM-<2o)UGu9xdN}(N?t@J%A#WZxvE;FqJoY<1diVot_vqn|k3-3iKWa%h zX7(R)*5kD`FZuDIlstLzqnAAS#}6!7ipQ+#F{@9!4Kl1C3if5~Gc zYg4~To9nk2z_W-kHza@5BOO!v08YY>@3)z3*+T$uXu!G@%lCMSTaBG#FZ!Uf0h12U+KzY zJkQ#S>|R@8?bcS-^wrKXvnt(uZS{vA!khnruV-y_4bQ%|rdL^8TVXfoyyaWpKv&yq z+8c=dE$eDoFB|Gk|2blJ!gV@twLAU$ngugLbO#W9D}5_-r+^{K7)^gkI*Q1k%Hvh^ zDF7_3aCVI?oc_P7BmCJ~B6rEPFB zu!0L$zGp8!i}>~8-|KMKrixQ9w7!HXywd-0O19+^pB>fX(5k_TzkHn(3MbTB!if9cp*Jd4frpCfM( zyAC({ylQjPpr5z!@>Oxm1h^guY@nWjCxab<6>M9yJZiC8!m3v7kR&d0y9V1zlW*3C zs|#r&zyT-50de8wT(|C#oD>o;1=3l9M&70|5@Skv$xhPVLcj-qlUvBtW{U+#)N+Ju zM<26H$pLz|JU8O_e=9DO87~* z<+rP^wFiH&f0aJZ?Y@}$eIIH|@X}F7+PDwZ$keuFjnhL5p(?yRD{r_bS$Q`CD=$(d zR$eJFM&5W5=zH#d*$$^6#nUcd-EbNj*Z6>>zCu2d>=F{8um_L+uQ~QaXtPM9GEC?_ z^epoJ_AskKL$v#FZ?E6!j^D#9819lD0Lti;6J>Vle`hd8<5+&aZ*IPE#ohM$W8`#t zy1Ul}Z*#$4_L>^lcW=CuFjPEM6$W4v-SIoUwrHUP6?koGDZOGioJWqsV_Fa8`OM)0 zDQWb-a2TFWepmR*hlL(^ZACQmVmFlL{@m+ge?=$Bg>EMm2FHQ zg}wl0f6xk=)DW@)Pg9bb7h=j#+&@>(|sW*9bCKo*HBo)I@(+D(R^tP2{s&Qsdfa+`d z{@pK!ov*b&v@&=3ZG4^DLCya!ZSTI6Xk{v#z76MvWR*UnoSB z&0&2KAV*g{FzrF4ULRzE4Bh zD=4NoRB8+1rH5on+H-qO}gV(PH zuh$2!xA@sw8@$;ZY?8kDYVdA#@W-pc_H%$EIv;*|(Lgk9!G8VMffsqy(fMiIZ)#?UKD4sBf{ZSf7*S5O21}8e-~bx zkizoJoL%%z$}cNEj__$SS>Hy6_a0Vrnt~Y3qI!xwKcLLGg*8?HnD)*`Lp^P(cO5on z?CQ1txb>%}4f{*~)%&cOw>NJ5o^Mrh?D-*BSOu+E0Sgrv7lvvJIHoWO$tQoDVAb~h zg*dcLSj7A8+#VBq+hzH5Xk^PBf6yEY!IlSIfLjhS0~S-WHx%MNmmcbbhg66=jVN-W z?g{k^Yd=Pd1419XL{pIHtrQ9EACJD?MB>-XKx3f?Q1Ki-vp>%&5?Ne!7CL{i8hPs2 zgSPtGOpC5d&dZ5e-pKXAfh0n z2sORB{(+j3?y@@bdDeoxw%g6W)@&B5v$yoTgE?8~xz%lfn_V120X}(kcyl-~8t3yt zA;ACG@P|Yy9F6Pv>o?lwAp3!OspN!8V%$O>sxU}v239Na<^0Xl#WOTnVqB4aKhl)d zvN^WBLL9^Kdxy1Q+h*8Se^oKy#})18E#a9tue+Ox%C(?%M7z$4M zLx+U$UWg~z`lB}CSO0zX0XrbRz>8SDGTB#5@cxRl+haV%9>s}7$n@eY}&~Oqc|}UYR53((XPlb=v-*; zBviVWlA9dLX~_-;e_M|-%_vy*7Bo=D?%j5zX%RcE*S+kkAU9-Q>NT+BAR@%bdQ*R6 zp|^RLEO9TpQn?tF?bjTH&!RQpW(c51;5!Zag%jz%32K4QTF1olPJ=0LG>?}}L3GLM z=Mn#|-v8xM;q|YN!z-o#9-iN^qlz!j+bEI;UIL6&k5LBMZU?1#dUo?$Dg1LpN6+qo%xRPSyOfA`HxWKD;1llgQy6JA*}Pl!fF z-q39?+n~L@z!yBeVmC6AiQA;@JM7xaT+MFGNoDm#{}zUa&-1FLXyQ}v`#W=ZV>11w z`EYeFz3+2A5?qwOcx`%nRDQ+vu5zzdo1TY`skQ#d4caH3JoSzfp3GW2;M(kyoMwSj zwqM1Qe}PmBMeEn|3&3cm$a)z4^mJgwTfHrU8C<_bDFyVgRX?t3cyuipmK%}N)M9vU z2)d+(LG8TSTWB4XN#Rs}GBLpOik?tAyLmwt(-Q@Mp$Hh@aqDtCMM2M^^3~aR+o6#VQT3BCRi-m93mr*L8UxnKfl ze+<1>_fkojxU}zivA%39ss~qqr$PmV0_K@%r1zE1i2J6woiE!q63<%onf+VkVMk2i z;UWjmRAk;~p1&(w_pP%1&1d`DUD@<$%6}OmyowS4&xMU;@9*y3#IWwS2}#&Z$T=>0 z*KxzoEGi`NM_(xCR@p1bZk;Xjc5nWhf80pP77FeDx?WTB$SK+ShcF1Z7n?mpb*?tRaAyFr3? 
z2Nxtb$y==b)8pX1DV*wjpJ&3+mO6dQOAmQya}2n+_zEizeB5;1S$UgWdD9&mlzn}{4Fog(K>97ABQ~?V z70G~pi!#y(7~ohL&(8WrbH5=}f07_*9Lwy^iic;sxE{te`|cLeP%4Wpus8zSp%WM# z{3DbjMVRyrQ?_qWVL^gGU({f47zWp~a+y^53!D zbM~Hu;MohE`+Za(0O!uX%N1nrW1e<*EAF8(^f;Kn?Bv~>7t0%)&);q=@2|eld9|@@ z-blH6aEU`^nIkC8|9};R+sJfCV&7&c*xFCoxKYC6sKuw#u-ON1<{LiJ{SX=Ca2CP((0t)|G`%9p<0(-SRv-xuQ+0vVpazI+RygRVq>&`@+Ta2kp z=L_a9rZX_kL*KVZgUwOmfP)~4&*c#?u&_H*=`3JwG^7gzf1@>$cv=svgUzj_jV%_1 zEd{PZ2W4|S>OhW1SeTo-6xkw4(6{H9mMlZb@jk~>iJJ^=V|itHdwtYlaK6)BuQvO` zeiuD;=oaXB(lgK%BX`BI>-|nTQXJ55ny}@-PI4xQFA2Jl(24)QP%Y*XFjSe{C2Jnh z$V<;K??!UQe}1hutr`Bt;=R~F-RHqxw(4BKku<_cu`23T#h6RkC|U}&#$o|hqeP(_ zE(#w*(gS1Bms$u^j7QEY(qFOfi#PDadtsea?nGk$9a40s6PJ*oRrjs4n&s?iI_v?r zR=3uc{krzIk8hl3x69C9|IKG_IFH&hVFCQOL{X}uBVsHupv@g@E=Rm$8{b4%vawJpN?0PttW}U?&-kNf zX6T$@e+G)q>VJ&9b=0jVY~{_;o8%;;$PML+W_5twL^}{`&Nj$NmxAXztfPXH4jB`9 zXHeceGO_jRA#>`PH!OL>yn~e(D6vMa!CGW0h72u8&O6wPr(?hqz4cCY*e|uRj>xr~ z|GQb+F3Mq7#StxU)ShlcQS4XazHag~JzsGre;8uHk_j!!2=Y1>PC$iA3S&^}cT3)C z9ocPCy*CTq%;xFA?Sd1H%{Y6!ax;o}vkR!sxCn;G52wX*8ZPH&H6z~>G4gip>Gq7B z^7LXjxNecCg^bcl-~2Y0R}7MVGsrB5iLh?#R@pdZ<^X%HZ%{JCmbcB8YLW^=*AI_ z_I$`+v3#020^GU-0p88cn}xob7@$-MM`_}@X%C@0@r~@cQ{Y_Rh3n?%ZNPg7Zq=}o zL4)_vE&RRY;W7LmBrx_g?LUNHNnapfe;R{%l#UZVwe-9KH8s2!2C)rH4P6e)8D7-S2#_f9LyfTgK+` zNp+T3DDHIee_V4s5QYcxNZmjIp8_Tba z=;2;WUB&z1@2wtoJf&xN!#XU@f4o`VN_YF^fqapVam^9CzU1@9OP!-l5>2F>1i(X$ zvxYt|SraAPLxCIQ2S;ImN=nhym%mQ>UXqbn$z2ai2l^j-c>W^NhWf>&y1M4+VCxYM za8Biz&^f}<^ST zGs3@%(-J#bsX75m7DN%MrlmhR;&||CG@{N+M~a-);D;VW90~WMk#yOs!^q0<>D zdFfQKEL=tQ97vVY>9xWkfAHgg83{<9yo(Au_&ZQ=I`VxTB1H7+tFc$qzHpvNh@n!@ zIpo0x9^4@ig$M&R^GJjWLZfF-vAFM1uVlAX^F^gc`Iwa#cTCSO<<8xRHGUnlNe_-@ zD>;I$gexSxCEEN13XKQN_Bk=xFNw(ImZOa3fRR5Qor%2sB9&YOf3u!`DOb{+4T4fS zMEo@M40PSa0z*hoYnHeVmb8#%_lSR{&yYj{nVbiD4ggbzzEjBu=;9ZbWhGNMD#>k) z{!pE}@!?R*Zk;^fS4n?d*v^t-M>TCsUK1)(i`r_6+rl&`;1mB?$=#A^L9x)~NJDn# zT7{y9^~<$2pYIk6f2?P^PW+0vlXnpIk-XAs71`2|>6cc6LP;@^oCIWNR93UM4g12E zL)~fkd(BiG%I#F?P;S5Ydq6UKvV@aQHfx(99&*{iHH^!P-%5O99rlz?`|M)NcVQW! zyG&ur-1vlR6SzSI)rK|UU01WE&Hul5H_V=(26wG0xwc$We}X^R9I;VSYnJ`O@`nZk zb#5wQB?ly6fCs{3&|VB`H9HG;=!So{sCMG!VQ`GC?15N>!q|f=5(h+LxL-|wx1aITmG;!Fok#4*^~ zp-^A>5EgVve?o?Z%7wRF<+j!f7ii?L=_UDhqY*uA=m}ZTin$!NyugXMAmiADVI_XY zC<*j$YIr%jpi;P^l6|GF*xt2FFhM@BJ2npdq)y0hu!l`t@w0!Ts~HxNyvcd5XwgU@9H@w7QN!+o}gWP zc95EBQM!j24i$?i?%qN$@92-&bjCVXGI}~Td^^_hZQ^e>>7-slFM^BL=ts5LjNgLe z)Zh*Sbj&tYJ~MZXYSN2UQ_a^z*q=BXQ7c+J7w-2J^q>9{Nr4&1enSkdYIm=o_#mKQ zc?9#Lf88?tv<0+gDizrXBy3dGeBm9Z6u3@BCdD{HIatn!>@_2R-3EX57fm$C+f4F~ z$s{&|K~1OdVKDavf;Z^UC_R7m72Wauh4gZ);$V$gs2J2R*oEvmvfoaS4mZjj6d*S1{`z) z#kZ2HR&?sW5JYF2o623JLIV%+L$Musw;0w|e%B&yswA~+5ua~K@@L^H8UGAW)LqX$Bq4_JaskZwaA|e78bL8z{BtZM-e^!rX)Y{}+dF7gKR6eA0wVNJ6Yj?0I@+^3r75XX%M-o6=7msQ%GL0U+;Hb~8UhQ0$)8Mj z&Yr%Ic;w#LV3EBGU<&MxaXZN8{Ap43ViCY!RPz+BvdA{kjkXQ|0-wG&)>MK^e`q(! 
zG)5{nhG9dYMxoH?yo@geQV^3^UX{j}bYTtmeH|+&=?h|pv1&|CF#G+g0{f|`88v$Q z9viHxl?boKM!!lk=xCdGP^8NJ$n}g<;qNwUl()7taTg17wVu3xM zSAbInm{;}j=Gp&EJ89K*Rqo|zDM|U|m*@AUr@<%Fa?FtEjLgrmA|R`cF8I8)l~B|ROqoR%e|g#%Jh-$S zV<+ktjtfwoBHs(-MQLa_*iM$o#u^nk=L+&oTo>Eol9*FkNO$+jc*j4`EyrX2UcW5Y zw>$O;Rfp)G&|U3`ssBTreJb~|u-fj*k)H{mp0OG<9Avp(nZ{}Vz|ttu1m_I1S{dUP zw+nym8fz9Xl83~_H?dUxTAk`NL~34+s{{J(U(aP7usISkJi<~(VE_rui@!* zy556z6Lt9A-K(D#W*4jfoUBdV&u0GT!w2OGiF=BvT$BMgd_KCdpf4W&bI<;ks^tno zVx7v;P%Z8D_t0ofSfaEyCNlw#$L4-8Hur;KZWpaEofu+I$(87Ie}2Un!x_IxbW+hu z%^UqWRaSM;4aXft=~Vn+XvN?+LHv=KV52i+anIG~JB@wo)Jr(_n?hROR?XJAmy#5m z20t1vGJGU;XaL{pln&SI=9U5U@_y49IV~l;hSx){=d7s6IIa*k=wsr0iKGm!@WyJ{qO1NX{ngp zFADXCVwM-p06-Okr+a0yIa$;_{7E*S1AGxe`FZE(8u#E3)b?kde@2I@pxSeOp}39T)?m6*@4leQe-E@bb&m+^-^%lZ?^9S& zeJjs*su=xVfcWH{3PSgXS}hclbF&{UcmSpsXUP|xHry%9IkYs|GcSc23ibj+iYNVq zb_iV9KBsz-e^XlA#EFx3r}s1*NMA~WP>k}PWv*bN%k#AvuHX0^;#WLJWTDr&6n`y9 z$=@%D5FbZ8=fwH7sE>S)d<_QMUoUGnI6;KZ+^wE3X9~VsZ7gSp)eHWdomEfyk>KI1 zdZMRZb(>pk>*%QZemOhtR=4=;jAg37gG@WED$(^te*;xl-LIpCs++tKvZabL-{)>0 zAAZ#%G$@e{RG0a^s8=tSVYYn6kh$HgzS8sgXM`TOB3jNIlbjJ9R$rpbfUKtaFMdzX zs?V0WN1U7yU*DZ}6V)X?v=w^-zDaeTmuvjbcB+3Wqdh9(dwZgKAEm--|1#5Vsy6jY zn1J>rf2R{&y5Lu#kgKOWFtQVVWP4v-xinV4AJhBG?DC+~>{qugvnv$WtG{1n5P8+L z%j~+&@9O340%u97zLG7i5@AY2^vs+n&#YWL=E3znQfY)q!PgkCiO<&&NV z-I$y`&L$?Yl3&2JVN88usqKCt@u?*q7ZWEe@z0BHG5LWd|Mcb2-$>58(V-i%?+_&7 z40uqwfH5Zf$BH?+*jSfTi!N6Vn>*{xe?83?KLLjUr8uo&0j)Oyb1c8rR(aAWzt!HW zt=o@$zBPke2^l^k7VPLWq<+G-+|!a5EeW}(F5fsm_0=P1FwF*22D`V2N-4*%m}pps zjh8?<`J6}4oJPLmDvA+BCT1v}E=|)Y!zhDvKjIzCDCougo5SWz?*zP2uH1eUe@J-BvE&I9ws75c6*No}8s<%Ivm9KKk&ffxc+CE4snyNLGbgRe$Z{Wt*cOY}Grza0i zFA4)Hb!`|oIbm9cyMYRbC23M^e@j}x=UBJZ%f>78p;fuHz)$R^sLCEc-&##0{r>nw z=}t4FHeoQFc=|D%ehn=h?dphbz&tmHh`bS9BMXXruToJ#4Nwc4QJJ5Ki^5F!KAd;E zeT}Hr;0Sl4#dY!L+D_r?-mUa7kl_yv%ksLc_Lv7mu>vYSR3g-=L(ZzUmWQ-Ee>5VL1;ONN#E2YQPpu2FMbMxYRssgf+ENRBvpDEyJvVi} zS4aKa|Lx?YiWmA3+qGEI)EFJ-ro*XdJRxh1GdC|60`}~&GDb(C`fe^}@}AilTVa>k z=0H`+sap?i6Wi#rMGM=*i8p#2tOZ1*!GX$%_7fJQRtq@iz=Z?^9kuo#|rS6IeMTR6O zl$Pk&s8Ig2=D1JlQd}sOOCd1STCu|x7|J>{8W_rtMH~5p7Qt|6s3huI3ISGzbEBlL z^t;ZtQkf!@Ly^0AC}_k!=Cy2y)+ylZ4n#Y3SQ$P9$Y|k$e{rI75r5+ShN|$6gzoSl zU_qH9jfcDZmUkD4P>1=AxEP*Gq_4C2wFHm6&{-OuC@MQ6ZlvNT4erov0uFY_6t|i{ z<3V_smzA}4%>b2M!+M=CGIC)pgz0)Zw-APQ(!{jgMzsdDDPrS<4SG$TYwZQ|@^$N8 z5+5yd*@+)Ae|P3~@mu&Zb<)h8&ERlWZ~U+8!|pDthlKgPMOCf$DL*sL9Tt`Fu`&hu zRxK-wm3DQgf->b9^;96*U^U-5ALIhaNBosQ@KrwgH6kSC#`=O;IUqC`rtuR#gvK7+ z8Lam(9}g+}J_(y^@ep>=1Q$n7zic<`r6YQl9 z|HBc9YJw?(F|+fdav)dN@+N^R;fZ^MF695(;OD{5wZSAI4F{7os=+{g_n{uJE8m@} z4YI-XV7fM#<^Mr-urPS?g#T*;{XYOm{2SB;PoF;3AO6`h{|5LA+Eur9ywX~kfB)7$ z)jiNZe|ymL|D2Fx;W4z^?7x4yXS}!B*8ie+O2Be!p3R42`2Ae7TOk3x=l60leSA z>Vwk(8k!H#0_?iKO?$htH^R9MKEYi<38Z_`ZlFjaea``bz|Y#(C zY;e^-8uYs8gdCm>6i{n$-M!Gib2WH9koxBUU9zUBfyhmByj2=z_V#wA>in9ZnDm4BM=e#;brEzAL9J`$2{}TOo}hDRCI^WZW+tO2^RDK! 
z_AH2qUZpY?oObO^?L62kHwKq7A|3#8e`af7sxNtlY41BCNwfrcm_R+!Z?;juIY@1z zu~JfDdF|MeEBYB&kP*SPSgb zT%MnN7P+$aQ&r@_V#8h7Q+RbAn{U6EO&*%Z=EDec#4YoXpY!OjeY~&so#WQ&f5kO= zI)$zFpZJQ`#`GSVQ)j86bZz=nKkD37r<@giL0@7?Sx z5=_6lzwZpQhjnwR*sKoA->QuG+3K66jrVC7(<)A?i>VvSg1BeQSMlaH&bLqX70DyM zeQ=>ia$5V#{GEFDv*kB0_BWSae=qMZZSFr?URix(`kx-9DVhBz z9s&RZuc>2#>IrW2o#kK^o8{c-x{4>@F+v+H!z4M$--2SwY0xj?jT-@MTsgT0v5OEqW(To(F^BhLV94 z(~TDz3^I%0NDMW()Bs07xWC6i{~8Bvo>F0M+pn=p9VsCWUjD`@Ww)yMHL2?)*5&xE zrZ5m*tfMv*cF(BsuYE2}nk|4Vt7HrUTS4mSn9G>Kid6@LHcmrmA_0fJCx3UqPP)>* z4;t9E(XX^G=-AL=RJ+--#Tb~SDT8|4m|_8J%Q_m;0VX=VFd$bhnjhpW`rNn})D_H| zSq??{ie1m*(8aAckx0IVKt0hEIw6`#PI{K<3h%y&W)7n%Izq~it6~qKluf%bSKpDR zh2m-pvr}xA!&ZhB=8YKztAFS|+DAh(;>O;U!R!3XAkR5*by|nUfI%RN!|$l8n#3hf zXvx)1LQ9U5gA_Rt1vboiF<49+DRg~Vn5|t-0!X_yewfsVMH;Ld>v_6cYklsGjDS14I=_e zK`ieybOGM0(TD6qr)+P`ovz9(tdw^Zw|Nu}E&7|aSTG;z29LRF!tiiuA^$-q7sVLk zB<;E1fd4`j>NYc>sJ|Lwmo;N{lB$Yu1bEnuLaroByL(*z_{Y-5n`+6)S$vk;Y}GEC zeoz0aB?%ATXHA^&aer?xFhN$V;srsbo9Mf-SurPNQCRL7ew4kEjZ}ajkVZbi#as*d zB8zQ;EXnqHc7HcSbu!)sPqcy(X!3R!Ecp~5$>Uf!2B!KQPz;}kU>K4|AsD`V8-DUt znAZI!5g4>2us0frDf`r&%@1-MW9)~nYLUlEv-P{PG*g2;h>^&i7&fLAh zwQ3q8!%HkbBSz}K*_XA8*!Ov&v<^l@;@;q$w$#&?9(>~SIc zQAYI#KH>M}6kfQ<;30LR#fcUyafnuZOqKQ*CocSl6EYp!v*dqfD&HI@H#JJke84mz zCP3vTqf=eH@9epE%e2(5&AV43*jtQk7~M!?Hn1+FIp(MWnUjAkWSyEyYVyrFd8@B` zuZBo5ZGYT*S|JsL>197Yuh1o6o0{hkIz2;+DGGsoQHOiQi4S6J@wRJaP83H+M6P$C zn2u>q+^6m|brMo*vU$ECDOLHM#+j`Afp!MxiR&Fu#*x270CTs-OV`vED@pPci})VV z`cQ#Tkul`~aabx0!bOPx)CZd(mU4FMoPZ8UM3Zyl@fhs-4syrs1L;MLbmG zg*F(2;Jm6x{xBDx+fmZEK6X9fH7#8@I;d=ACW(h#%-P-pp8xL6-)h1F0$)>Lkz1E#8HchZNwE`0M3z17S?uv%h0wW+Eh<+?TnQ&ageEiS(V>_cr=7O3*hMJyV~Eyr}(`YP^*1o~ruc5?DUs^LzAR;)N` z0fqIOba1Y+@EM8L9bKB5XC2qb5IPiuMSp5eXVAg^nCs~GC z=8l@Y8c2r-2#3Rbj&)bPJK{ibI)Bhkq>4c$o*fh)k)1xnL>zw$VR5=*;f>>vtgJBh zw0lVG7F?AgBV#Qgvnt>rR%Rr&p|5lWBh2>S1_RXX#x$TCc5y?fL?LOa=6kWo`EA6~ za3Q}!!sbv7{>Ak#|LAgteeB?rham$27vF;(gp=W`-hYI<&HV*p z@~V#CVwj;P$!W6-`dsmi2Gi2?DSuHMp5FPNgW4%Gh1V@W!UwK<$g` zlF{|Cf4MfNbK)_|e_P(zsG930fD{nBcJb}kZ6pTc> zW~g+Gj>jIcy$SpqzsEF3@TY0I2Zb`l#gcG$oR}oqQ<)-?m?&B=BQ=^@dm*Lj#<=LJ z@ivHYK$aW_XLUygTGaUt#mUe|dGs8IO0|jk`5FB`;dM8!(7pEhX@55R+J8InLg7SwChwYO2_I<<&WJ;^19Mg>>$1Zzr&Fmi0UEWQD|O=} zH1B+Pu7;qgNG-M(M^Zc8m#yOyJTjd0H<~!OD|fonizgwea(@`Yx;d? 
zJ1M!cIE^#9D?ATKbD-sXIXGqH>cmO=Q*WAn_IjJ8Di?$Uoov3DQJ=yqEaEce!V8M~-IM>nyTr9lh zxj8R_7oxUUv%_%kN>q;UZAP2eR=c*};@$3*zSsCK8I->?VHf9$@$h*vm#YHjv^?=E z9&zNF>HuY4t$!0HBF=;Jy())u+6N``cr-JSG0Y0;C5u*#e5~gsCIa!h7I?}ojJ0zj zN*}rF@WbIsXZcOw2l2EHzq?8A92Ci|FhH>MCl)-Z1YjeDw5aMR0hDOWt-e)vX7*Hy z!~i<9yH??D&(F>LZH55M6S2Ex<;ZMVWF?MFg30wNxPQ!JtW8{|H6rI1L=V?Euj1o6 z#x&R-%NuZBo2w_Ng_s4oGk3r@a2Rh$oW1&yI{CS3#R8`2 zGRUzODRE^qVSQ(>b1oK~`9ON^yhhj+bAFQUx!4%z2_by*rW4h=SK|O~&6}Z9bsQm6 zQ=Wl~CD*Kt%Vt}VIoR)KFHvV}l=1f3^cf4HkgBC+ zM_eDc z=wOloU0&}L>}>X-r!*St!h#%?`C0q5XWbv7Zs-cN*RdXHmtC)a#SZ1xe7)t>0QYY~ z9e-GT6W+ID;9JRx1J75*LD#K{Rikt#g2>h{0K1XpTyL6940IE1dRteBP>LBa$G&TM z)mZJt%-O20KG}!vNF`&^YOL~oXi*0{s(VwXeiV1oT%HfPSWLBZO+Cq_uj13$6jN(V zG?0zx&Z}L2B4X2-Bheh7-Ino|(kAa29Df%D>Z*1jLj}}%5tf7B!)@{6rT(vlnrQv= zEuBF>p}E$7s`b&!E9ds6ck#Zv<6 z*|=rhZmI}N+dZ&Lw3K;TIBG94ICdc0k#nY9<1vST_8(lZM4Ax$o+GoO7$YR45`O_8 z5se|nch8lv4tfcOCNCE{0Pn&(CMt*(7R&Tqt7?3m!`WT+!>M7Y)uC_3v7}v_uH=hG z?~5op$`JAgIya6ks&dyru3tTe^j5}1&f$RwaNQbU;y2WYUGSBv9Sxe)q%&hdRf{=~ zv#jf(Ei({7o`GekiR9mqbtmJXk$-#Wh4`9nak(bF)&u6K2q)}up~8mE-{Mr?srE}_RST*cEf2}UE59bz{PVI7`-gy?}wxqZj0_ofw*;V8K0itzz z*+oJP+yyvZJaBY;04bJvkqJg-gP7wI?y(1=@h8T4GT>-e7-wDlDy;ad%6|&~hCmt( zHyIjBB1XPt0VBp-)Hq%@2`?+Wp9_bn(|y5t7cX8WwzC1fG`G2N?SE{V+0(g0kGU5UL2n%B7BB%==bFePp-dd-D>kOzW zG0n%Q(ca8ZfW&oTQtR0saR|zSB=fteWH=EpaNaLrrGsg+$ zrh|>jH9P267~ug8N1Nv@es!MsmU;3+WXdl(&K5W%3X4QW8O5Ha7=K$pGJ5m51fV%# z`>Y<$0@oFm)@!PFG?Ec4+R0jRPR<@ylICJg8J=SgIwX9dLm^2BZE})4LF5rA(8)7x4#Ne6Y41%K4&oyR%h=7^=e21aMqhvM)81VCXz7z*H=X9+?CR$6{7F7)?- zzaKK^UvPw8ns_k9>HJ{AuDEvBCBSt;tb(Pzc3 z2=Yk}KqS7qOIldnh!ZG%*BHVK`J9@hkN@LiHm_RV*PqTe|9?WB`qMB?4}TfudY<9o z6WWIH;&;_TwVUhqtaOFzbJ<*YAMd)Ya>EC8yCX%4h)J_xOt7WjL2gXvEioJ50~n4w zZ{->w!>k756(lq8&V?0<{HLlwzTS7xb(OE4*U)32QlrqXjSB!?5Hn--+~@g#N8DTJ zLz`TfB$0<>5`RJMZYyj=Mz(<(LAKK3&-*%mbRy0#;8k3UAHv6#-Rgek3RW3rxmcMF2K1zq~;=pbyj7Oeqc=&?D4&clcNnz_>QBXd`ykRj#uK)hDWP|oKB z%M=cx6?kUxEMob9UD!h+a2R{|!h*l$!%#uclb>+VR)22AngM(gf*ou|LTo=g z&F-)DbAQH9T(x|lx0)!hS*l?+0hO@Gmj=dd_ak()$%50Oq;c@O29LyN83 z$_WjS@SzY@pgV$*ZIU*f=GUz99tMLghx7sby;zll4U_K`HWva0oanqg0*j}a!y%^{ zDU^OH7D}sgg|Hj;rR7?5=W~;5=3cGW$qu^kg~CJ~ba5R7BpT1)P}e^7_?lR5A0#~1 z$bUCpGuz^wHQgSY?S;U)UPHIqY~hWa&P{gYL$sftzJJqdqYInp`MM?Px#5W}5%cXh z<$?gT5j{zC=*O`umtD{jLZ%Z9hVI8r)kH+3PRZm3u+790TrCP(z!Sa-#re2wbuW6v zw@5P2l(xG0J0WC{ZDfd6&ZYVUXRcrRIDe*zkB$=__&s!9@Cm*Yg8eg8t}K;DWh04} z@m2jC3z4@Si*X{;NMlGlcDur|IBZB)eDRXX(`oVK`M+L@2R-8=j)_Ld@8Ja=5D8Aw z>cO-(C!I-a{|QU_s`a@w8|oXWX$$F-!Wam?rc+w%ry`&h3K&HodH9H6C>7jZ{eKq_ z(R^@tbeiW@8zL8oO~33MauJSyE>zF`8a&Z#FCuk-Wu7ByF{TTm1T$r*g4*C$Yiq>% z*?G?K!yQYoHi2W(iOg*|swI2e&?@3i5yjh%?E0MH&idyy;-3U@r^z_-goPG&yIfny zaai2_g}U?fw3(#ZBtzapWgKIFcz;KFp#iAK4pC4+f(f<`4HF)fu(kB^ds5`wk>5Un?^H_;j}vR=Yi$5j54OEyuEaE>R8x__)FJVLKc z7Fl`X#oO0@wW^Pb8|hBIOdSY58g647e_r4XCI~#$X?9s60?kzqKfscY{k@N~6L)HB zXrO09)*_EhvSw-uj%t&{MvQVu`*4X@+$%xUl#RJWN&3uQ4dQ7Oz!kiO@xbnxF}GMl zR|9Az>eJqGv3?8zwA#F(ynn>i%4GB4O8>-$mV1w#b+mceQ>32PuU^DHXpdw3-*EBF zq#;cX?=^99@?~4q_}6^To6uEP)*L2(h?0@6v1v-V_7W!7n&G9Co{`rRcZ4#%>+Hm6 z>ZG*1;=cF}MGMqG59OKM)hc9L$5>?@?GHH+DM*Dp9N^i=Woiw@)Kx1fcm<<-ihY(@0dv{>T(7U@n z=bam0`w#Rjk+Yl{EPowHQ6X{Kx&=jrm7)2PHw)c^-im_JsR?yFJ#Stvyrwn;BYf|i zii~uABQGPI+oQGp;l)qO=*L$5`FY zqT~7MS?0XqSJ$!-IizJF3X>QP`O&gXBNKE+3h)LVVU zl-Ryp*5pOjQ>w?8wLdS)UoI=z{+gIvo0m1O&y-Uf8wF?h$Cqh0s3>qrCnpgo!M4d) zGup|8#oQTMF!=(h_KItzT95BAC%Ortg%-Ku%Z2`YsGXwSvxXmZ5AqK)&S_*~NgI-? 
zo0rDfGJhAcp%=uf9_3|@@(n*uOf{w^q8ex3Ik=am9O^ck{8$7I$LGs}n%G(NxpGZ} zo?MORBJ;{@$sHiSbv7s^6?A8U>Xu7yGqR zyUy7za$twQ^cA~oggI7FX0NJnozwN(iP`maa(|6jNgcy7Yx$g&FEnZ5K^!Pp(g6C^ zIH+eF1-OZa6XvApQ&VS8J_O6tP<*@gfna=NW^x;<$cxwgh`!=_bX$RwDhpq2Ad{2s zx4@hUN>0O{p33Xfb!|8qC-h$<2Xh4lTW3`ufT0AKyZ_8oOmSuXRJvx8ThR4w!c}uh z4u4Klf2OcaQV$e82gh82q><>17&a^5Xv^)`$pY`GLG@HPs!{QX2lX?&DU`1cH z2&S6xqJv#(4MGSxp%kI9?JJ_2GE`eV^y;)$kC&VY`2la#PT4@y^NaOr-HUt(q8Sg7 zIOT-Rt=z0U;y0g~`0*^e&Q@%d(Ehk?U zM1@5G9U7QZn_;RqYMTbT$$!b&VWM%?sqJqB`5y=9Q@vD9q-Q=-VVpscPX^0>RtfXzBbbm>xj1y0kpW zR}ZuH9_MB&ZuGxYj0+0{rp?_#EqwY5EXiWlf5fZ~e1+`)$OY#ej~f46BS_~@Gkdeg zIevT#oU-Jg0YjpeKz}GQXFVg$1TKSa_^l^F*MhEx z>n?U3lH(}y{%{whrqx@-u(XC#MUL23qt+PqS(IQ19m2(x)36gXM-+{{YQ5ITT2r+~ zbh_JBBekxy<6615+`CS+`T&QNg9)PJ7LK(};VARa>=7U8)_;?jA-+gr7;q)HrgR6} z;N2x~$@8>`JzAINAC7K!gm9s`&xiZII5srin%4)uU5qG{u!1Mac+9^2-qHYdeGEjVL08 zh`FFvQf_=99EvHl#w7;rveU7vE}G500nkq9k<&6S_J0cPn%FD9s@cnl%B&jb=|<)2 ztJ7ESo;0?hw4wfU6$v+@G6DD;K#^~W?=u9K9Dk<{{|qZ-Fx~ROo&JL~Q)W~z*(r7N zZ0EH5(;H?-nAY{tk{t_<7T?t0lxfD}^?I%2_DWLhpo=URns$+o)g(RK2tlqQ7&e;< z9FI7Te1FX~>a0TD;_|niepvcF`SA99b4)?YlE+`EfWGdpfh}8MgIoiEVU$===YlP@ zisZS?TH4~CF8+3@*8RzZme`0VCw8v~Z8s>Hvi*8bIDL!fTVte8^_?C&vCH3DgL`2H0lA&`o!RUDh>dZ#+C5*01po5*f=6}6fJ>@KHyOS$D#0lSEt07+ui7Y8@ zQ>BFk0%dT=Wi(zkJ)=Q5b%aJ3Ll(Gx87&ZsCGAGut_xi!yH(gA0ht+MT68t!8-uWJ z)=4BFi6k<$A^&nlr3fB6X62{9J&gUZLAEhurDJGA=*2$tVjB5?iS>t6-pOIRB7~a; zs(;BzNy`d*$@|k|B7^QwOwnXe)bA%uGiG4VpG&sfY&>&`I$fM1>0q$&figVv3s7?` z5|r@sF0-x^y3QLR0e)WE$~=;-#V{%dUQ%94%@9WY|>QoQS%wyGpCiV2J9QoTVMS9=~=hK ziRjA8*>p(dP5|L=Z=K6-`%<1TL$Mcxw z;l{h;nfGH&-G)shcFRPxTRBIU!hd-K)l*YIGIe!zHFNdL43_n?^Z)qAKW2S|bYh;5 z%kpS$Zf;ib&=Y>iy1x$uW`oFBq(*l$t`m%GnUlN3OIKIe0x~0;F*$Nvv45i}wr#Uy z!~05Zh>K|Ixfr`Qnp^aRb|gcUHnUZ~?_0CfUqb3LZDW`A_mH-fr~MZN5qIe~-E;9)o(a4#IJl2U4_SVSQJbV#Ir84W8Ha;0aMgU;`I);%sM zXk#uGj6n*%SJp!|zayHyi+@o1J&7?vjK$_z_s@zrJZ@)9!3xS z4wEg(&lR^o!BgJT?^Ptiv<{1XiRjoZ0zMc=UKWi~y7zpz97Q)0YNoqW=K=vzE)*{- z(fsK|XDPa^W=xN+PRV)TCoBPi&7}oS0tEgfY9*}~uFHYCW{)&?T7US0t1R>qX;Ng0 z=680T?`TyV)_-2%uTx|TDSUp|(qa&o4(Ai}vE8FmPJ z#|A8psC!s=?{taMfdA-1VqcT6)Z1vb>tBi=RGkNhYQp+xWat{jWt}MZT||xUtA+l` zBcc7P7;QA~t*NS|?M0WB#Yr(Al4%m!Y_A+2@hPs-< zGlUpN;(8cNMG(RngrO}sKY`~hAj@Nq$bUBz&ddHni+490J(mr1g3@`gC&w^vLjI8C zgk3jWm7&FINDt$o6J<;!@NLgm>uZBb0z-<>XfZzN&c9)a5!GYz|NVbBWnsWPo&U?? 
zMQ!4L|F8dhVt*OEcPP0jx$GAFsya5G zpRVEn2!FAh4k6t1@Evh{-n}@}tu!VQ)sQ(}#366DfyzYi<2t`c9hL?g6U0=_OB{ij zIUxOKz9Drh0#Ds();?Y7_81UnEY9`eL=NdOt|$m@htqHkcM5d6#i~uIVvI+#{5IN| zHf`4uN3GpKV~{g2Rem$d&c}6m_TM|ZU*_hfcYnV;nj`j+!pP13w7B<>z+=_HvZ>wanZ1Yiy#U6X z-@2nhk3$s8#6IG7j-yM6vh0&Nw(;(!`j2rQcLX)qe4IJW|7!hlhSY?p0GzGW&SR;& zJ%6Jbox9afR~bGSiFRHGQs>XQU#f?X#c}R;83`W6PTvuG{QNN>hr~xem34M4UA3yG z-2$VEMqP|>>BmiMfD@bGBWBf9AzV2#AH}lrq=g|Y$WjKCIM*7=WPGDixz{c2?s7fC zTE4*(Zc8oF1=k~VcTA7E`GvhDQ)bI@2!CCGrd(d7=wF`Cq=79SH_e%xiwhA}5Zq6R zijVK_>1p5aw0fttb2PD~sal3hcwFTbiUx_NMPJW=CM=}f!%CsFEu1lAG89-LM(0JGgZ>VHow zBw6dI#eRU=<=CP6Bjx(ft3PT(Bhmu?^{K@6-CO4kZcJV7Zi0508)jd_L?&gWh?_ob zC$$0L#h7;OYp+xme+rlxVk#Z8?y=&!rY8G7ZOPG@lmjr;%V$@;l%HC1`rfqqS%lQU z^;u42QRGxc-tCNGNcLcN;xM2PLw_!eXX2;BLvF3;(7ItAYFJ5UN>A*E;N8OmTk?Z& zHTtpm;rZ!*aDII7Ky0%I=jX@A2R!5Zt@KmH+cP;83d{-$tIX9Z#+d^dmpcH+Yy&5K zpN79_1>K56QiN0%6Tm7e?8v>QP?SEcAD(wno^V6<96~?9Qw`8El%4>DF@FOy69f9e zGV7kQP?Xg#b&ad^V0ZVZ+pQW2Dvee~CB0+w#lSu;O>2iA+s&Zotj)t0*DqGAYtzjD zib#cF*lnctloQBqFz9qDj0NzK4Lxu>{0yYx;6iA#!de#tMmt3IViY7;*{Qs`xsu4Y zSs|zVs&7X^n0Bga0SE8{ek{Rq-Ceg- z51J?SOE5ao4NGex-5>NykEX*CAe#sP%PP>vogK2mHcT_SI?X}BA!hoRNdJH2olCP_ z$93Q5_fs5kVmUnLdcPl(V3h^gBC80+F%?o0%_$`D80>Wn$}xUO~5;rO%cBw!7x%Dt>x37`EXCa{*=7dU;qBz zu$qcN<5SREU_LnMZH3L4P+C4a0rQL;UAuh%X}EgZ6WiPRn1AN`f#&P|XB$S@=9pnG zr}@6gbX0yG5wFeof)gsnE;(r+yL=v>USFwv5GeS?{WO}NYc$p^_=e;@fTGr$&_`JQ zncjbtbpIYvFGR#wPhUKTs_7Aoytup!+w`-O$dm2#^!7Km+MR#GFR^N-^!YCqm0X+j1-Mz6O?t%AdY?^6b?MZfWH( zh3Oz~u%j56SmuAYXqcxTezCcc-c!~*g-}-Mbr&;F>l`!hKO^JW7RHNeXE@89_$w6i z+)B*&^0H?Ebj(*fC*UsJec?^$1`IF*_I0qA!tXi!4}a7&eP-%MBHHjvu96Bu;vmRD zq6HQu66b5t#R53GLiIJmsA$*I#o@g5BPeDx^Kb5-KaqPIi)O`()D4>;y};DW zDGM$IOi#C`#0PF@;$<`A_UVIVuC_v;VG~54vv@KH=CKhi4LE6Ra_fg_+C&UvnQr6=1!tS z`OCd)lM0nRc2S-`T8QWjxD+r8pPrc9!kY!BS@dX|efiAyTlO@eo$2`lJbU5QQf3W` zqM!d%aoy)QiL;y=#UIUN{Nlm$pDN;}M)@Wi;(w_uzu;!T!x{-X{q0#>@96U#$dW8F zJg{Q%KeI(-p3%XnE~ac*0TS9%n#IJ+ake~r@GMj#XY2dldlvG)f@t&CC+W%eiU4t0 zm)O$(2FI*z;R{KQ^P7=eKpCHONoSH5{>B>vG?ZE$I6RZY|{is3()M z@qbnR;q?0td2tT+EM#ZrzfI?=xzRW1{io=TZr;*Kv+MKUd;z6>cKVaI-hTErz8xTW z|6hNN0F&q8`s&BhexA*4>o;E2KebzVrginyP9B7%i}@~#rT!7O?QAUckG@yfTxfD&k@J_kN4&6SSZ9}z z=?0~#Fz$2b4KW8}=2pq#{%C&8NZy(M3Vx*TeUJE_^Y+=ZBagJm0kJ~FV?!hdr+aNJgCn< z`PGBw{Fk3TXwOfcJm}7!zvRYxq~XWni9dev{mrmL zUg+-`{*zyws}%m1pX%=!{^ZHIPJiLpo%~(==_lvUn+G3Ln+pC(Nr3kG$@vZ-&!&Gs z;=fPMPXID5hO$xfR{(+VZ=>e#05U`Vt%`hd{sTbJ0ni&yK1m(GH~EwIKKSeR|KgYL zuri;upPYaGm%sSAe&64Ha{l1q-@NzN_Sa81-sFGH&z?NdL6u)T`1p^0zkj|wp6GFY z#mPG#e_XZQxg9wg@^Rk{=b;~2?sF)6{R6r>F^;5oSyaurtg>RCzChzApCc_+SHrn% zsyZq5X&f+sd7$5y?I*6n^tRzjG==X9)6n~y(^q`V%#;88} z8Pl<0>DSxS$fi3%umtR#?w>yZ#UE{lhu(TLwr501S5=4$-gWo*=6b1ad-nRJzb*T- z{c$Pl+xC1Ju9x<9IP+1TFQcB66!Ra=&6VHPxe%=8qdZ@lYpK7~LRH@m`pL2UbJwdz zOO2+vtthSHowuy&U4Lir(rfEOHP1Y&u8j?C;M4l3&Si1)vg|3+Lt_4YRVT%{wqf=q zMH;G>^|ezyhDxp4#!wCIxhkx?rW$Xq-6$|ysAvpa_EadnKvk7D_mAIx^=Di%(6dORj1m&?QNDF|@kd|b&3Lp&G*wp{)f8o*nt=0iIM*Y;T2Neq(7HMILJgQNnQ7T- z1>jO3w5kWdz<;bjTeAFA(N96ks!(lFA!|W{y3j!r?nEK2p}Oe$X+?%6SrJ24uN4t; zAOJ0E8CE1+kh0*)h+50hi_BVEjQI0>|w6GPQEPtYA=1W>Ku(Dw6Rja1Muj{4v zEeRLZL)&4DhQ74{+v>_wjazheF1>%Kva=yE3mb)YPX&<~g!zl!rvF~{e6`S-ec4}K zVSjan6A{-Tvhr_8ML*qTJwq)uwxo))9$U@&wye{|q*o&vBXt)07)0U=((U~NIH zeBssD7?pRRv1;Ynj9Q$rd})7ivdO+)RuE{@R0=WRye-M;?Pk(FKP>*c+_f7+kYG3 zN>pL{!B_`GjBtQJkp$>dTB0$KzC`n-^S{{As?HK;)m=sE&b2t6YFDnq>9aAxL;GC= zB~wO?>T4-LZk(_YTjARYyA6bmYG>+#v^Zg)DkMzXb5}H1ltVfyY)|sILs!tR=6ur? 
z-B)kKR07&X5kKEbcR^tY8Zwk&m4B5*M|=)ddbwgVIW=~A5_ z1|Gp>Uc#8rHFz-8HCwjTLl&()l~Te_I$pZsSOmB%V@u7Yk!2xQMGCc zl!%^^Wp%oAO?|oyRPP~$OC!!j8CVyey){u%psertPhUL`+V*5$-L>mZSad3UD(mso zKx0(7smk%xb>->4DhK`e%8>9_$c_6XSuhYFjK06dC~tLnTK_9ghQkB+RtzsFE!W}* zgrpyl7W5WQzK)slZPU3e+m_i-Q7<3jA`Xz|V>8 z3I3tO<<^{zJD%TL@~y=r|Kacd+Y~8zRE;I%`EFD8rw^T2-{lePat9DXEC2AH{^#6= zh?=71Fib^xL>oJRsg3{j9ooQO`KX2rZH&bP;}Mm103=EW4E*=G=YOWPvMt(UEXyMr z*#X4S|NZ}%+o+*W6LND=9%&%kfT@iH0~kRnS~m&8yIL!;4Vc>a-^nN1>Kc_(Upb*DlyOxnz4K)~^VG`Zm^N}6~j>I;01AE+cS&E|GE@q|J=MM7U1 z7}-fGv6L^kuc=gW1DvX|gRkD4%I4AnKUM7o`ZhKUP~snLS$~WYc5FPmztpr{@k#41 zOP*6)ztP?WKB0e8hmNhzsT*Lx8dZh9eQJ7&%Vud$BjY_`6Vn`O_X*)WoM7lQQZaT$ z#CRwLoEprY=lbA}`mVc{)kX6dkRW`pyNU~=6&5d`gP{tCO(!Wa^fwnrESz3QzpKZw zuZJ(uu-=7@Uw`o$vZ~?#RtATVOc9|P|4n~sKwR0QVIY}ktvYqGmB8(#J=H465}_{E z*hU&-LD5DFJXlvlrLMlZ`J8UPY(n>iJ^Q>s$~23IkIzmvFXa7>bmL<_)nj#kqxnKR zm;v;D5%EZB4M?ntAqZoPOT?ouh=+x!Sr_Q*R26(^kbkrQ?OY7gq7|)%W>B)aV3of> zzceM(1rdv=hbls`Q4f>@erO%3Ts9{W4;JHT7;Z2C)tRWJWHeeZ6Xv!)Csu_umIOxb zm-4$^><=(*>j(KzNwlv@myY&CKP(^ga}oWR&D&xXN=eX=ML!qOj}!u05&b{|8ox$P z1#KRY4}Vo@uD-iSNv%$r3p+(0qM(x#aDM`83|EM3I!N?1XlDw!tSKemomO#>KO`p~ zN(^Sl3e_SW$QwkkFGe}Mz(U9Npkr2rt#6xAo)0#S_OvXU6T~C$w+h(6o?%?Kyu zG(iQdn(|$Y!+X$bjzZy#P9YBlfqlpFuw77Mz<&d_6{HOZ;saqPF2jam8LLvC%u_t6 z(i+%`MvkK`!m6^EH?FrCQxg6}2ewnRPH+ODz0oM}7qQ>L239UZC_VsJ5EZLzo4jh| z3`ioNA41vdgK8lz~|nF07U%G6`*GnSLyAX7-O{h%DXQ`2Z5xC(9{C{gbtq_~FQ^|fW;CK7{JIitph~>KFlJl{ ziq!0dQZW=+k8l-`ZH&W_e{;HL8`N6NVPwg%1+;KseMhv)d3}j!aG}oQN04SS+|zFN zC4QQFI9!P%gwDXT{5D|BW?myQY8J9WiStqNbs~(dynn$=zu*u2dUa7UA&84k+L4(qUVMBx7f5FyQX4 z?BV3M9U@2cd|R8uQ?(Pug7_5O8%3z;o0Fy`xuSW&1O|Sa+`xNjR40kiw$>h=K!2O* zt@JgXN$hAP<%CxOq$evZ64mMrR>v4A6r&D%FecPAZQ-b?M&TLscxx#D;@_z}6f!KD3 zwQ6Qkd&HVB2(=+t(SZ-=Y$g!sNCv4C88!j(A?>+9{PMD#vSQhyGBS`7y#X!I5OxQujr_AnlT^XBC zTByauE>bG72Z`dcJkTa<<;2xGi*E0Ef=O(uE}RZ>QIB5SHjA04QvT&Cwr_|8?MZPJ zWV(iMM_}`zkAJt4dVC{mx<)EPD7YR!r?I|?>nOSwrAu`z+h`gj)c%~K761PC+$t{= zwZ$o*Nb3TQPHs{7Lw|JhhUFe}R_C**mEw385!AH@1~pM63j)EghQ2rm1sJIIw{&A! z52KKbmeFFP^M#jJ)QwL-3Yo{o416(s;Sc*?otUL! 
zinY+mDx-WD@*x*#EwYykofd$W2?7OyPCi6K!ha|G1PXu_wbY>U`7_R+ntL2Pl2m~% z3m5FJ7Sq%2K`HXp(SEYuj9gAdF>!erWZWQjQ9%QkKQJc=#9IXg2lTtbH$w1|;3SGV z1|0~thH$M)9n)_#mRe~d-UVgBJW{uyELmUtG-&+IkdVSWb9c)aei>B8uS{&!6$mP07=z$OHNuvQl$$5LChuHE&4+%jKl$1V09spZFq`yvApc&iL zjYNNxz!rWR=B2ur8&w{kilhq&B7)`3Bb+JEky$2IIs7*0L)Zkv6Jaz=LH&XD&@-6n z!I`fxp;V~J%uc*%>hyeV~?`VcTLX5m|*xag$o9!a>o8LY9X3d&nYq&Q`V$9sC7ADZjsuffddBQ6Izw|DpTseV*o6^dI7b=sFrMU*`J* zm*z^8j+F)WCP(>745ddZwnk1m7vwMjU4JYPKYs+;i5b#X@1(Sngc%( zM{W(9<{+wc6;NI%qUUludyE9n67_!-3PCm@+E^%&zxkcSKS7T*&ce$5AR*}P(M#Wf zIPnLn1bxN4a^*dI2kCFSnhl0uMgArT9faWOVShlAw)_&bM&*ilm&3hK@ntRN%RK9- zS(q;Y^wN4q8T?vQ!Oz22!MHR#6dX+t=OMi^DvLLnHY;CtBfbVdDc(h5P2+#XlDNuw zh$%TRIgNuB)Tp#T8Od@U;<%?tnX|p3{|I=rE_9X2#h@L*r%;c7Fis%4%j78wj-_ke z8&R0*SHgJ_!B7q10d|A=%A}^Tm-GSOVl&WgaF&rsg|gVfbX(Ea2&;we%F*=MxyRXI ziF0hDU&u7{tFVOnm8&fF>rQ`0M6#U5?kl5PIm^_od}QiY-*dN_pKLZ{ar)S%)-o%s z!KdvIe#%cKEzY+ax)c%@wA$!e#v@m#lsi^azG-&1yLQwbO+BCakmM<2 zm>xrduT4{87ZAEMZw7yvtW1Dq1x>W(_(KDiwW1V13YeVoe#I;lsfPJ&`+3%A3Ir8U z)fep{sw75ITzyAWf$AAo-w{;^sT5b=5LLmEnxG1TR6hN}mAdI&b)34V%)LnNT2Th|t2G@k`QcLj-*rC?|^4+cNvzET#sHZ$WlUx@HScA03MGClOhS~C`KSW}&!crKt`!Is!2h;fEQMZ_e@ihqwm5_!JJ!MMvm02kv zi}i`S90;xm^Q1$oUi1q&%|xYOFveNu+B+@B`wcUH?hWFXl+X3F)%S0?A9 zdv9^Qym{Hs)#QKw3W^PmEmla#m6t`V(Hu+q1D6*lMcFZ@m*Tx9>gBICrPY1D=3$Wo zQqs(H0*j=Q1R;M0PA^tCp;Xie9tc~e4r9mTeUVn`rJc1}T80`mlf)V?YzgMo$;_^qcy$&?+yUAZv1pN?UQv&zSKgN0*Cv8KqY4 zMCQF5{K95~R-lfotQ#ASNW~K^14MxbwJ!>R7;t~-TaJr60K=eI`rark+i}T;(;6i^ zPHi6VxeC{qgpW|vn-!xCVOx%M;*whPoZ$D9yRkJj*7YP@1NH5Q9~O^D{Haw@h=*f# zFiW-#UX1tQ#WJrf9|FG&CI|G7iUn7`jQ3LZMJl3QWUHyyB*GL|wv6plcrwI;>7X7> zTtj~xmRklg+04e@>EqPis|8LCptUK$sr(J8*eoCJ4-1U*H%c8Y1mze>ts6+A0O=47 z>Vd>7Ry+p`pgcT!pi3u9a9Nk+Uy|i+Mn6)HFzxNos1%6P%-OvpM)=Q&Hn~=6ytSYW zdjSL>RgjacW{04XjisGqhP?bh%1;_@aYTPZza6NO<w zsol5YBXYzkXF(>B{}zJe?G)42H@I{& z`w9P0OVUax#s;$;0V&?fG!+fASTYGcCQWh|`a1l$d8iX0*mTBFb(BxYz}>|+rzbwz>>t*wDCC884_kkusBq{% z9Eso;>NK{&{?K=VK=_X#knary_9G{7LCjF#yfYOX?Z)yL8$_2K#?@Bo2_x0g>Dc7# zazzNtt1d2&r={0fx#_P7XW-8Wh**+g&3}WnFP~nunHea%&k}Ma|Ln|ylKISK^+9dP z>Ks2~nxqL?(BCo-C~2Qi#EpN8PLKWj5eE4`BvL|36uJ2$Z>zis zi3JL;2zQ7SErK>3M`S$g6{Vv(PAg zTuPwQY!w8$Ayroiygl^CVIhMn+GkdmH7XV^oyw5q&%%u4$GfX(Lx)Gq4za@`csKP) zO#Jcr@1A|ewJxtLUWBK-xNGlQiG-5e8OvaRhCE4>{^c1sF&G6z)Fwy`Aw#I{@wF*##Ay`Y1QG^}2@*$c zt79S}JkqY@3w7pw9ZR#$SqmmDT!sDXK{Yfy#ENZNf;?W2&$)jw*SvqI|J(JCqwbsc zFSfMkfBbg+bISUg_J6&5di`yn0S-uxP6aw)1{vu5GQV4WkPLqlj;Q-lOHPkL4mz1k z%V%sKmE&Byj?tmHUts4t+by=vc}<;RbGp!(xACl#jrtIXc9kmmd^aG@@k95r1!tMOBvuk8U3t8d#y${LI z#j$T66Gz4Mh|hm?P?2CbK0Xi1Xy)4u%D1@N&snoJI2l?NgGWjVzLf3Mpz?!suloNt?eF1<@hx;$-Qz@LQ3y zVkH*3kvy<$?UVA7ri5*~2E$UePcdhk4nxki$Kch$l(m0NDL9a&%bTYuCT?@SE@W;K z?H~xmu9G;m118G7_T+8;w(M=bmA=hbm+9NrRr>Z#Y~rO97SoaCRIn`se6yq|4J}soXqZaV(iz==OyoSB*AeHusG=Em-TN z`*+iYdE|fBH1aWGf~dYhkTS+7&B6MegOtHzdh?GIqeNT6==sNqQCf&n+xP#;808{F ziKnLS&8;(-AJ#L~!n%hCGy8bSHT$pfMCz8K&Ys69E~JYO?3A-k{Ue@q>RHE)M+p-} z5p{gtp87=zFeuCDJaf_>+W=Vu!x|Ai6-S~}!tQ^x(GPtf4{^!p9vmug2@ZeL2fg@z zmfb1dYBxC0HfO)dxA9AIBm?b)wOG2HnBfWmJ4>%LPYEBtOF-80I3N7`Gh_;AZW;6oJgzZuN=3*j6oBP>QR8LQeP*vkOVFmeD5Hss1ns4tC2+3{KoN zAP;{i%nK3r-&y`yhQ(DJRKiTv{X7=3N`8ckTkQ4)zfs3->yZXwNA3OAiEz&f|6!Rb zjD<8ZWTJ=YP6aPDjz}khS>PaE&m*~|%>*`eY+YyT{n)!Lgx$Ci?@p1#Qk!}_obBe6 zv#$V${yDUc@HOuzC*seFZk-j~;#9odVAFq^kBI9LL;*a-T1KHcdvwcL(S&hBU*vHO z_Qix)TZ<#W<8wstBsFJ>|L3LmjNEU&CB5&FyXAB5(mP4ck={k_fwc6?X>0lg>Qib@ zQoywl{20+U0tNi9$lfWQs}HCZDL(ANX%ogD{~{Z^H;Aq6n}rO zRE^&zr^4l#?{RW{h)92z+iXrBsz0pD5A8Ac4RC|V=}Y_LPG_^fM4IYSC>`0JYv+~h z#)4RBK7ajU(YczNsuOq85OfNBqicnu1g46@S82th#!kPwWDwG^y1m*j+9OeeB|nxYy08^L5R(3tN`q4!L 
[base85-encoded binary patch payload omitted — not human-readable]