diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1e2429364..03cde0a48 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1379,7 +1379,7 @@ jobs: id: pack_artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - zip -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework + zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b2a1845e..23cfbce5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,8 @@ else() set(LLAMA_STANDALONE OFF) endif() +option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF) + if (EMSCRIPTEN) set(BUILD_SHARED_LIBS_DEFAULT OFF) @@ -145,7 +147,13 @@ endif() # 3rd-party # -if (NOT TARGET ggml) +if (LLAMA_USE_SYSTEM_GGML) + message(STATUS "Using system-provided libggml, skipping ggml build") + find_package(ggml REQUIRED) + add_library(ggml ALIAS ggml::ggml) +endif() + +if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) add_subdirectory(ggml) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() diff --git a/cmake/common.cmake b/cmake/common.cmake index 0f54871e4..a5bb787f1 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -1,3 +1,5 @@ +include("ggml/cmake/common.cmake") + function(llama_add_compile_flags) if (LLAMA_FATAL_WARNINGS) if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") diff --git a/common/arg.cpp b/common/arg.cpp index fe6a1eece..b6bfe6f89 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -853,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-sysf", "--system-prompt-file"}, "FNAME", + "a file containing the system prompt (default: none)", + [](common_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.system_prompt)); + if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') { + params.system_prompt.pop_back(); + } + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -1875,7 +1889,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.out_file = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), diff --git a/common/common.cpp b/common/common.cpp index c7cf66545..92f2c57cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -998,6 +998,8 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... 
(--no-warmup to disable)\n", __func__); + llama_set_warmup(lctx, true); + std::vector tmp; llama_token bos = llama_vocab_bos(vocab); llama_token eos = llama_vocab_eos(vocab); @@ -1030,6 +1032,7 @@ struct common_init_result common_init_from_params(common_params & params) { llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); + llama_set_warmup(lctx, false); } iparams.model.reset(model); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b5d95bd56..d13d57c54 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -908,6 +908,40 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = [''.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") @@ -3412,38 +3446,7 @@ class Rwkv6Model(Model): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = [''.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - 
self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_rwkv_world() def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -3565,6 +3568,168 @@ class RWKV6Qwen2Model(Rwkv6Model): yield (new_name, data) +@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(Model): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + try: + head_size = self.hparams["head_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) + + # ICLR: In-Context-Learning-Rate + try: + lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + except KeyError: + lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # unify tensor names here to make life easier + name = name.replace("blocks", 
"layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + # lora layer names in fla-hub's impl + if "_lora.lora" in name: + self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] + + if bid is not None and "attention.x_" in name: + if "attention.x_x" in name: + # already concatenated + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) for t in [ + "time_mix_w1.weight", "time_mix_w2.weight", + "time_mix_a1.weight", "time_mix_a2.weight", + "time_mix_v1.weight", "time_mix_v2.weight", + "time_mix_g1.weight", "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if 'r_k' in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + # dummy v0/v1/v2 on first layer + # easist way to make llama happy + yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@Model.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + hidden_size = self.hparams["hidden_size"] + head_size = self.hparams["head_size"] + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + # ICLR: In-Context-Learning-Rate + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + 
self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_token_shift_count(1) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 5da439e94..184cd4195 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -660,8 +660,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | -| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | +| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -671,6 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase | +| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | diff --git a/examples/main/README.md b/examples/main/README.md index f7c249729..e4b3590b5 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -27,12 +27,24 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Input prompt (One-and-done) ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" ``` ##### Conversation mode (Allow for continuous interaction with the model) ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```bash +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```bash +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" ``` ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -44,12 +56,24 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Input prompt (One-and-done) ```powershell -./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" ``` ##### Conversation mode (Allow for continuous interaction with the model) ```powershell -./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```powershell +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```powershell +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" ``` #### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -77,6 +101,8 @@ The `llama-cli` program provides several ways to interact with the LLaMA models - `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)). +- `--system-prompt-file FNAME`: Provide a file containing a system prompt. - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) ## Interaction @@ -89,7 +115,10 @@ In interactive mode, users can participate in text generation by injecting their - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. 
-- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false) +- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found) +- `-no-cnv`: Disable conversation mode (default: false) +- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit. +- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false) - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. @@ -125,6 +154,8 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t Example usage: `--chat-template gemma` +`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains outdated or incompatible template, some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py + ## Context Management During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 91edd741c..a61982dcf 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -79,6 +79,7 @@ class Opt { ctx_params = llama_context_default_params(); model_params = llama_model_default_params(); context_size_default = ctx_params.n_batch; + n_threads_default = ctx_params.n_threads; ngl_default = model_params.n_gpu_layers; common_params_sampling sampling; temperature_default = sampling.temp; @@ -104,6 +105,7 @@ class Opt { ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; ctx_params.n_ctx = ctx_params.n_batch; + ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default; model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; temperature = temperature >= 0 ? 
temperature : temperature_default; @@ -116,12 +118,12 @@ class Opt { std::string chat_template_file; std::string user; bool use_jinja = false; - int context_size = -1, ngl = -1; + int context_size = -1, ngl = -1, n_threads = -1; float temperature = -1; bool verbose = false; private: - int context_size_default = -1, ngl_default = -1; + int context_size_default = -1, ngl_default = -1, n_threads_default = -1; float temperature_default = -1; bool help = false; @@ -159,53 +161,94 @@ class Opt { return 0; } + int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) { + if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { + if (handle_option_with_value(argc, argv, i, context_size) == 1) { + return 1; + } + } else if (options_parsing && + (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { + if (handle_option_with_value(argc, argv, i, ngl) == 1) { + return 1; + } + } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) { + if (handle_option_with_value(argc, argv, i, n_threads) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { + if (handle_option_with_value(argc, argv, i, temperature) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) { + if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { + return 1; + } + use_jinja = true; + } else { + return 2; + } + + return 0; + } + + int parse_options(const char ** argv, int & i, bool & options_parsing) { + if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { + verbose = true; + } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { + use_jinja = true; + } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { + help = true; + return 0; + } else if (options_parsing && strcmp(argv[i], "--") == 0) { + options_parsing = false; + } else { + return 2; + } + + return 0; + } + + int parse_positional_args(const char ** argv, int & i, int & positional_args_i) { + if (positional_args_i == 0) { + if (!argv[i][0] || argv[i][0] == '-') { + return 1; + } + + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user = argv[i]; + } else { + user += " " + std::string(argv[i]); + } + + return 0; + } + int parse(int argc, const char ** argv) { bool options_parsing = true; for (int i = 1, positional_args_i = 0; i < argc; ++i) { - if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { - if (handle_option_with_value(argc, argv, i, context_size) == 1) { - return 1; - } - } else if (options_parsing && - (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { - if (handle_option_with_value(argc, argv, i, ngl) == 1) { - return 1; - } - } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { - if (handle_option_with_value(argc, argv, i, temperature) == 1) { - return 1; - } - } else if (options_parsing && - (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { - verbose = true; - } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { - use_jinja = true; - } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){ - if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { - return 1; - } - use_jinja = true; - } else 
if (options_parsing && parse_flag(argv, i, "-h", "--help")) { - help = true; - return 0; - } else if (options_parsing && strcmp(argv[i], "--") == 0) { - options_parsing = false; - } else if (positional_args_i == 0) { - if (!argv[i][0] || argv[i][0] == '-') { - return 1; - } + int ret = parse_options_with_value(argc, argv, i, options_parsing); + if (ret == 0) { + continue; + } else if (ret == 1) { + return ret; + } - ++positional_args_i; - model_ = argv[i]; - } else if (positional_args_i == 1) { - ++positional_args_i; - user = argv[i]; - } else { - user += " " + std::string(argv[i]); + ret = parse_options(argv, i, options_parsing); + if (ret == 0) { + continue; + } else if (ret == 1) { + return ret; + } + + if (parse_positional_args(argv, i, positional_args_i)) { + return 1; } } - if (model_.empty()){ + if (model_.empty()) { return 1; } @@ -232,6 +275,8 @@ class Opt { " Number of GPU layers (default: %d)\n" " --temp \n" " Temperature (default: %.1f)\n" + " -t, --threads \n" + " Number of threads to use during generation (default: %d)\n" " -v, --verbose, --log-verbose\n" " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" " -h, --help\n" @@ -260,7 +305,7 @@ class Opt { " llama-run file://some-file3.gguf\n" " llama-run --ngl 999 some-file4.gguf\n" " llama-run --ngl 999 some-file5.gguf Hello World\n", - context_size_default, ngl_default, temperature_default); + context_size_default, ngl_default, temperature_default, n_threads_default); } }; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3450dcc5a..80daec979 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1868,6 +1868,10 @@ struct server_context { params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; + // force F16 KV cache for the draft model for extra performance + params_dft.cache_type_k = GGML_TYPE_F16; + params_dft.cache_type_v = GGML_TYPE_F16; + llama_init_dft = common_init_from_params(params_dft); model_dft = llama_init_dft.model.get(); @@ -1888,10 +1892,6 @@ struct server_context { cparams_dft = common_context_params_to_llama(params_dft); cparams_dft.n_batch = n_ctx_dft; - // force F16 KV cache for the draft model for extra performance - cparams_dft.type_k = GGML_TYPE_F16; - cparams_dft.type_v = GGML_TYPE_F16; - // the context is not needed - we will create one for each slot llama_init_dft.context.reset(); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 36ad276fd..58cdd6af9 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -621,7 +621,9 @@ static json oaicompat_completion_params_parse( llama_params["chat_format"] = static_cast(chat_params.format); llama_params["prompt"] = chat_params.prompt; - llama_params["grammar"] = chat_params.grammar; + if (!chat_params.grammar.empty()) { + llama_params["grammar"] = chat_params.grammar; + } llama_params["grammar_lazy"] = chat_params.grammar_lazy; auto grammar_triggers = json::array(); for (const auto & trigger : chat_params.grammar_triggers) { diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index 9f510d004..4ce1b567f 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -87,11 +87,11 @@ struct wav_header { uint32_t data_size; }; -static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { +static bool save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { std::ofstream file(fname, std::ios::binary); if (!file) { - LOG_ERR("%s: Failed to open 
file '%s' for writing", __func__, fname.c_str()); - return; + LOG_ERR("%s: Failed to open file '%s' for writing.\n", __func__, fname.c_str()); + return false; } wav_header header; @@ -108,7 +108,7 @@ static void save_wav16(const std::string & fname, const std::vector & dat file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); } - file.close(); + return file.good(); } static void fill_hann_window(int length, bool periodic, float * output) { @@ -536,6 +536,7 @@ static std::string audio_data_from_speaker(json speaker, const outetts_version t int main(int argc, char ** argv) { common_params params; + params.out_file = "output.wav"; params.prompt = ""; params.n_predict = 4096; @@ -1061,8 +1062,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 } #endif - const std::string fname = "output.wav"; - const int n_sr = 24000; // sampling rate // zero out first 0.25 seconds @@ -1073,12 +1072,16 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14 LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f); LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); - save_wav16(fname, audio, n_sr); + int retval = 0; - LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str()); + if (save_wav16(params.out_file, audio, n_sr)) { + LOG_INF("%s: audio written to file '%s'\n", __func__, params.out_file.c_str()); + } else { + retval = ENOENT; + } llama_batch_ext_free(batch); llama_backend_free(); - return 0; + return retval; } diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 9a4ee4992..740f9f69c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP" option(GGML_RPC "ggml: use RPC" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) +option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING diff --git a/ggml/cmake/common.cmake b/ggml/cmake/common.cmake new file mode 100644 index 000000000..1976d0ae9 --- /dev/null +++ b/ggml/cmake/common.cmake @@ -0,0 +1,26 @@ +function(ggml_get_flags CCID CCVER) + set(C_FLAGS "") + set(CXX_FLAGS "") + + if (CCID MATCHES "Clang") + set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) + set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) + + if ( + (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR + (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) + ) + list(APPEND C_FLAGS -Wdouble-promotion) + endif() + elseif (CCID STREQUAL "GNU") + set(C_FLAGS -Wdouble-promotion) + set(CXX_FLAGS -Wno-array-bounds) + + if (CCVER VERSION_GREATER_EQUAL 8.1.0) + list(APPEND CXX_FLAGS -Wextra-semi) + endif() + endif() + + set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) + set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) +endfunction() diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 2e5076d36..cb3edb10d 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -454,6 +454,7 @@ extern "C" { GGML_OP_RMS_NORM, GGML_OP_RMS_NORM_BACK, GGML_OP_GROUP_NORM, + GGML_OP_L2_NORM, GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID, @@ -502,6 +503,7 @@ extern "C" { GGML_OP_ADD_REL_POS, GGML_OP_RWKV_WKV6, GGML_OP_GATED_LINEAR_ATTN, + GGML_OP_RWKV_WKV7, GGML_OP_UNARY, @@ -1095,6 +1097,18 @@ extern "C" { int n_groups, 
float eps); + // l2 normalize along rows + // used in rwkv v7 + GGML_API struct ggml_tensor * ggml_l2_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_l2_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + // a - x // b - dy GGML_API struct ggml_tensor * ggml_rms_norm_back( @@ -1890,6 +1904,16 @@ extern "C" { struct ggml_tensor * state, float scale); + GGML_API struct ggml_tensor * ggml_rwkv_wkv7( + struct ggml_context * ctx, + struct ggml_tensor * r, + struct ggml_tensor * w, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * state); + // custom operators typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 52817510f..a797e2b18 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1,4 +1,5 @@ include(CheckCXXCompilerFlag) +include("../cmake/common.cmake") add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES}) @@ -24,33 +25,6 @@ if (NOT MSVC) endif() endif() -function(ggml_get_flags CCID CCVER) - set(C_FLAGS "") - set(CXX_FLAGS "") - - if (CCID MATCHES "Clang") - set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return) - set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi) - - if ( - (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR - (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0) - ) - list(APPEND C_FLAGS -Wdouble-promotion) - endif() - elseif (CCID STREQUAL "GNU") - set(C_FLAGS -Wdouble-promotion) - set(CXX_FLAGS -Wno-array-bounds) - - if (CCVER VERSION_GREATER_EQUAL 8.1.0) - list(APPEND CXX_FLAGS -Wextra-semi) - endif() - endif() - - set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE) - set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE) -endfunction() - if (GGML_FATAL_WARNINGS) if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") list(APPEND C_FLAGS -Werror) diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index b2d857e1e..6bb5d0834 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -2790,10 +2790,14 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset); + int64_t antiquantGroupSize = 0; + if (src0->ne[0] > QK8_0) { + antiquantGroupSize = QK8_0; + } ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, + nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor, &workspaceSize, &executor)); if (workspaceAddr == nullptr) { workspaceAddr = workspace_allocator.alloc(workspaceSize); @@ -2833,7 +2837,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( acl_input_tensor, acl_weight_tensor, acl_scale_tensor, - nullptr, nullptr, nullptr, nullptr, QK8_0, + nullptr, nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor, &workspaceSize, &executor)); ACL_CHECK(aclnnWeightQuantBatchMatmulV2( workspaceAddr, workspaceSize, executor, ctx.stream())); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index b8d272cda..68cd9920d 100644 --- 
a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1689,11 +1689,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, case GGML_OP_MUL_MAT: { switch (op->src[0]->type) { case GGML_TYPE_Q8_0: - // Current groupsize should not be greater than k-1 in - // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize - if (op->src[0]->ne[0] <= QK8_0) { - return false; - } case GGML_TYPE_F16: case GGML_TYPE_F32: case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index d6c4a9c29..6aa078a93 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -287,17 +287,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name) endif() endif() endif() - elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ") message(STATUS "PowerPC detected") - execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M) - if (${POWER_M} MATCHES "POWER10") - list(APPEND ARCH_FLAGS -mcpu=power10) - elseif (${POWER_M} MATCHES "POWER9") - list(APPEND ARCH_FLAGS -mcpu=power9) + if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") + file(READ "/proc/cpuinfo" POWER10_M) + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") + execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M) + endif() + + string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}") + string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}") + + if (EXTRACTED_NUMBER GREATER_EQUAL 10) + list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64) + elseif (EXTRACTED_NUMBER EQUAL 9) + list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64) elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le") list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native) else() - list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native) + list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64) endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") message(STATUS "loongarch64 detected") diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 8c7dbd1cc..4e0ae0572 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -8158,7 +8158,156 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int nb = n / QK_K; -#ifdef __ARM_NEON +#ifdef __ARM_FEATURE_SVE + const int vector_length = ggml_cpu_get_sve_cnt()*8; + float sum = 0; + svuint8_t m4b = svdup_n_u8(0xf); + svint32_t vzero = svdup_n_s32(0); + svuint8_t mone = svdup_n_u8(0x30); + svint8_t q6bytes_1, q6bytes_2, q6bytes_3, q6bytes_4; + svuint8_t q6h_1, q6h_2, q6h_3, q6h_4; + + for (int i = 0; i < nb; ++i) { + const float d_all = GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + + const int8_t * GGML_RESTRICT scale = x[i].scales; + + const svbool_t pg16_8 = svptrue_pat_b16(SV_VL8); + const svint16_t q8sums_1 = svld1_s16(pg16_8, y[i].bsums); + const svint16_t q8sums_2 = svld1_s16(pg16_8, y[i].bsums + 8); + const svint16_t q6scales_1 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale)); + const svint16_t q6scales_2 = svunpklo_s16(svld1_s8(svptrue_pat_b8(SV_VL8), scale + 8)); + const svint64_t prod = svdup_n_s64(0); + int32_t isum_mins = svaddv_s64(svptrue_b64(), svadd_s64_x(svptrue_b64(), 
svdot_s64(prod, q8sums_1, q6scales_1), + svdot_s64(prod, q8sums_2, q6scales_2))); + int32_t isum = 0; + + switch (vector_length) { + case 128: + { + const svbool_t pg32_4 = svptrue_pat_b32(SV_VL4); + const svbool_t pg8_16 = svptrue_pat_b8(SV_VL16); + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; ++j) { + svuint8_t qhbits_1 = svld1_u8(pg8_16, qh); + svuint8_t qhbits_2 = svld1_u8(pg8_16, qh+16); + qh += 32; + svuint8_t q6bits_1 = svld1_u8(pg8_16, q6); + svuint8_t q6bits_2 = svld1_u8(pg8_16, q6+16); + svuint8_t q6bits_3 = svld1_u8(pg8_16, q6+32); + svuint8_t q6bits_4 = svld1_u8(pg8_16, q6+48); + q6 += 64; + svint8_t q8bytes_1 = svld1_s8(pg8_16, q8); + svint8_t q8bytes_2 = svld1_s8(pg8_16, q8+16); + svint8_t q8bytes_3 = svld1_s8(pg8_16, q8+32); + svint8_t q8bytes_4 = svld1_s8(pg8_16, q8+48); + q8 += 64; + + q6h_1 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 4)); + q6h_2 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 4)); + q6h_3 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_1, 2)); + q6h_4 = svand_u8_x(pg16_8, mone, svlsl_n_u8_x(pg16_8, qhbits_2, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_1, m4b), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_2, m4b), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_3, m4b), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svand_u8_x(pg8_16, q6bits_4, m4b), q6h_4)); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]); + + scale += 4; + q8bytes_1 = svld1_s8(pg8_16, q8); + q8bytes_2 = svld1_s8(pg8_16, q8+16); + q8bytes_3 = svld1_s8(pg8_16, q8+32); + q8bytes_4 = svld1_s8(pg8_16, q8+48); + q8 += 64; + + q6h_1 = svand_u8_x(pg16_8, mone, qhbits_1); + q6h_2 = svand_u8_x(pg16_8, mone, qhbits_2); + q6h_3 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_1, 2)); + q6h_4 = svand_u8_x(pg16_8, mone, svlsr_n_u8_x(pg16_8, qhbits_2, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_1, 4), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_2, 4), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_3, 4), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_16, svlsr_n_u8_x(pg8_16, q6bits_4, 4), q6h_4)); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale[0]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale[1]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale[2]); + isum_tmp = svmla_n_s32_x(pg32_4, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale[3]); + scale += 4; + } + isum += svaddv_s32(pg32_4, isum_tmp); + sum += d_all * y[i].d * (isum - 32 * isum_mins); + } + break; + case 256: + case 512: + { + const svbool_t pg8_2 = svptrue_pat_b8(SV_VL2); + const svbool_t pg32_8 = svptrue_pat_b32(SV_VL8); + const svbool_t pg8_32 = svptrue_pat_b8(SV_VL32); + svint32_t isum_tmp = svdup_n_s32(0); + for (int j = 0; j < QK_K/128; j++) { + svuint8_t qhbits_1 = svld1_u8(pg8_32, qh); + qh += 32; + svuint8_t q6bits_1 = 
svld1_u8(pg8_32, q6); + svuint8_t q6bits_2 = svld1_u8(pg8_32, q6+32); + q6 += 64; + svint8_t q8bytes_1 = svld1_s8(pg8_32, q8); + svint8_t q8bytes_2 = svld1_s8(pg8_32, q8+32); + svint8_t q8bytes_3 = svld1_s8(pg8_32, q8+64); + svint8_t q8bytes_4 = svld1_s8(pg8_32, q8+96); + q8 += 128; + q6h_1 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 4)); + q6h_2 = svand_u8_x(pg8_32, mone, svlsl_n_u8_x(pg8_32, qhbits_1, 2)); + q6h_3 = svand_u8_x(pg8_32, mone, qhbits_1); + q6h_4 = svand_u8_x(pg8_32, mone, svlsr_n_u8_x(pg8_32, qhbits_1, 2)); + q6bytes_1 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_1, m4b), q6h_1)); + q6bytes_2 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svand_u8_x(pg8_32, q6bits_2, m4b), q6h_2)); + q6bytes_3 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_1, 4), q6h_3)); + q6bytes_4 = svreinterpret_s8_u8(svorr_u8_x(pg8_32, svlsr_n_u8_x(pg8_32, q6bits_2, 4), q6h_4)); + + svint8_t scale_lane_1_tmp = svld1_s8(pg8_2, scale); + scale_lane_1_tmp= svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp); + scale_lane_1_tmp= svzip1_s8(scale_lane_1_tmp, scale_lane_1_tmp); + svint8_t scale_lane_2_tmp = svld1_s8(pg8_2, scale+2); + scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp); + scale_lane_2_tmp = svzip1_s8(scale_lane_2_tmp, scale_lane_2_tmp); + svint8_t scale_lane_3_tmp = svld1_s8(pg8_2, scale+4); + scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp); + scale_lane_3_tmp = svzip1_s8(scale_lane_3_tmp, scale_lane_3_tmp); + svint8_t scale_lane_4_tmp = svld1_s8(pg8_2, scale+6); + scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp); + scale_lane_4_tmp = svzip1_s8(scale_lane_4_tmp, scale_lane_4_tmp); + svint32_t scale_lane_1 = svunpklo_s32(svunpklo_s16(scale_lane_1_tmp)); + svint32_t scale_lane_2 = svunpklo_s32(svunpklo_s16(scale_lane_2_tmp)); + svint32_t scale_lane_3 = svunpklo_s32(svunpklo_s16(scale_lane_3_tmp)); + svint32_t scale_lane_4 = svunpklo_s32(svunpklo_s16(scale_lane_4_tmp)); + + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_1, q8bytes_1), scale_lane_1); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_2, q8bytes_2), scale_lane_2); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_3, q8bytes_3), scale_lane_3); + isum_tmp = svmla_s32_x(pg32_8, isum_tmp, svdot_s32(vzero, q6bytes_4, q8bytes_4), scale_lane_4); + scale += 8; + } + isum += svaddv_s32(pg32_8, isum_tmp); + sum += d_all * y[i].d * (isum - 32 * isum_mins); + } + break; + default: + assert(false && "Unsupported vector length"); + break; + } + } + + *s = sum; + +#elif __ARM_NEON float sum = 0; const uint8x16_t m4b = vdupq_n_u8(0xF); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f2ab4c5d6..75dc96b47 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -8548,6 +8548,69 @@ static void ggml_compute_forward_group_norm( } } +// ggml_compute_forward_l2_norm + +static void ggml_compute_forward_l2_norm_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + const int ith = params->ith; + const int nth = params->nth; + + GGML_TENSOR_UNARY_OP_LOCALS + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_ASSERT(eps >= 0.0f); + + // TODO: optimize + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ith; i01 < 
ne01; i01 += nth) { + const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + ggml_float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += (ggml_float)(x[i00] * x[i00]); + } + + float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + + memcpy(y, x, ne00 * sizeof(float)); + + const float scale = 1.0f/fmaxf(sqrtf(sum), eps); + + ggml_vec_scale_f32(ne00, y, scale); + } + } + } +} + +static void ggml_compute_forward_l2_norm( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_l2_norm_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_mul_mat static void ggml_compute_forward_mul_mat_one_chunk( @@ -13604,6 +13667,184 @@ static void ggml_compute_forward_gla( } } +// ggml_compute_forward_rwkv_wkv7 + +static void ggml_compute_forward_rwkv_wkv7_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + const int64_t T = dst->src[1]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t HEADS = dst->src[1]->ne[1]; + const int64_t n_seqs = dst->src[6]->ne[1]; + const int64_t head_size = C / HEADS; + + float * dst_data = (float *) dst->data; + float * state = ((float *) dst->data) + C * T; + + const int ith = params->ith; + const int nth = params->nth; + + if (ith >= HEADS) { + return; + } + + const int h_start = (HEADS * ith) / nth; + const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ? + (HEADS * (ith + 1)) / nth : HEADS; + + float * r = (float *) dst->src[0]->data; + float * w = (float *) dst->src[1]->data; + float * k = (float *) dst->src[2]->data; + float * v = (float *) dst->src[3]->data; + float * a = (float *) dst->src[4]->data; + float * b = (float *) dst->src[5]->data; + + int64_t t_stride = HEADS * head_size; // Same to C + + int64_t h_stride = C / HEADS; + GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS + int64_t h_stride_2d = head_size * head_size; + + #if defined(GGML_SIMD) + for (int64_t t = 0; t < T; t++) { + int64_t t_offset = t * t_stride; + int64_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? 
state_cur : (float*)dst->src[6]->data + state_offset; + + for (int64_t h = h_start; h < h_end; h++) { + int64_t h_offset = h * h_stride; + int64_t t_h_offset = t_offset + h_offset; + int64_t h_2d_offset = h * h_stride_2d; + + for (int64_t ii = 0; ii < head_size; ii++) { + int64_t t_h_i_offset = t_h_offset + ii; + int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; + + GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); + + float sa = 0; + { + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) { + for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { + ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]); + ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]); + sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]); + } + } + GGML_F32_VEC_REDUCE(sa, sum); + } + + GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); + + int64_t j = 0; + GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + for (; j < head_size; j += GGML_F32_STEP) { + for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { + int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; + int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; + + GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); + GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]); + GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]); + GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]); + + k_vec = GGML_F32_VEC_MUL(v_vec, k_vec); + + GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]); + // kv + s * decay + sa * b + state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec); + state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec); + GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec); + + result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec); + } + } + GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec); + + // There shouldn't be left-overs though. + for (; j < head_size; j++) { + int64_t t_h_j_offset = t_h_offset + j; + int64_t h_2d_i_j_offset = h_2d_i_offset + j; + + float r_val = r[t_h_j_offset]; + float w_val = w[t_h_j_offset]; + float k_val = k[t_h_j_offset]; + float b_val = b[t_h_j_offset]; + float kv_val = v[t_h_i_offset] * k_val; + + float prev_state_val = state_prev[h_2d_i_j_offset]; + state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; + dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val; + } + } + } + } + #else + for (int64_t t = 0; t < T; t++) { + int64_t t_offset = t * t_stride; + int64_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? 
state_cur : (float*)dst->src[6]->data + state_offset; + + for (int64_t h = h_start; h < h_end; h++) { + int64_t h_offset = h * h_stride; + int64_t t_h_offset = t_offset + h_offset; + int64_t h_2d_offset = h * h_stride_2d; + + for (int64_t i = 0; i < head_size; i++) { + int64_t t_h_i_offset = t_h_offset + i; + int64_t h_2d_i_offset = h_2d_offset + i * h_stride; + + float v_val = v[t_h_i_offset]; + + float sa = 0, result = 0; + for (int64_t j = 0; j < head_size; j++) { + sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j]; + } + + for (int64_t j = 0; j < head_size; j++) { + int64_t t_h_j_offset = t_h_offset + j; + int64_t h_2d_i_j_offset = h_2d_i_offset + j; + + float r_val = r[t_h_j_offset]; + float w_val = w[t_h_j_offset]; + float k_val = k[t_h_j_offset]; + float b_val = b[t_h_j_offset]; + float kv_val = v_val * k_val; + float prev_state_val = state_prev[h_2d_i_j_offset]; + state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; + result += state_cur[h_2d_i_j_offset] * r_val; + } + dst_data[t_h_i_offset] = result; + } + } + } + #endif +} + + +static void ggml_compute_forward_rwkv_wkv7( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_rwkv_wkv7_f32(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -14170,6 +14411,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_group_norm(params, tensor); } break; + case GGML_OP_L2_NORM: + { + ggml_compute_forward_l2_norm(params, tensor); + } break; case GGML_OP_MUL_MAT: { ggml_compute_forward_mul_mat(params, tensor); @@ -14357,6 +14602,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_gla(params, tensor); } break; + case GGML_OP_RWKV_WKV7: + { + ggml_compute_forward_rwkv_wkv7(params, tensor); + } break; case GGML_OP_MAP_UNARY: { ggml_unary_op_f32_t fun; @@ -14582,6 +14831,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: + case GGML_OP_L2_NORM: case GGML_OP_GROUP_NORM: case GGML_OP_CONCAT: case GGML_OP_MUL_MAT: @@ -14648,14 +14898,15 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_FLASH_ATTN_BACK: case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: + case GGML_OP_RWKV_WKV6: + case GGML_OP_GATED_LINEAR_ATTN: + case GGML_OP_RWKV_WKV7: { n_tasks = n_threads; } break; case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: - case GGML_OP_RWKV_WKV6: - case GGML_OP_GATED_LINEAR_ATTN: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: case GGML_OP_MAP_CUSTOM1_F32: diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 4d4ac47c0..e78205e5d 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -678,7 +678,7 @@ struct ggml_tensor_extra_gpu { }; -#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS) +#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) #define USE_CUDA_GRAPH #endif diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 497de37be..8fb063822 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -36,7 +36,7 @@ #include "ggml-cuda/tsembd.cuh" 
#include "ggml-cuda/unary.cuh" #include "ggml-cuda/upscale.cuh" -#include "ggml-cuda/wkv6.cuh" +#include "ggml-cuda/wkv.cuh" #include "ggml-cuda/gla.cuh" #include "ggml.h" @@ -2196,6 +2196,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_GROUP_NORM: ggml_cuda_op_group_norm(ctx, dst); break; + case GGML_OP_L2_NORM: + ggml_cuda_op_l2_norm(ctx, dst); + break; case GGML_OP_CONCAT: ggml_cuda_op_concat(ctx, dst); break; @@ -2304,6 +2307,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_GATED_LINEAR_ATTN: ggml_cuda_op_gated_linear_attn(ctx, dst); break; + case GGML_OP_RWKV_WKV7: + ggml_cuda_op_rwkv_wkv7(ctx, dst); + break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_cuda_cross_entropy_loss_back(ctx, dst); break; @@ -2610,13 +2616,15 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { +#if CUDART_VERSION >= 12000 cudaGraphExecUpdateResultInfo result_info; -#ifdef __HIP_PLATFORM_AMD__ - hipGraphNode_t errorNode; - hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info); -#else cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); -#endif +#else + cudaGraphNode_t errorNode; + cudaGraphExecUpdateResult result_info; + cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info); +#endif // CUDART_VERSION >= 12000 + if (stat == cudaErrorGraphExecUpdateFailure) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__); @@ -3159,6 +3167,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g break; case GGML_OP_NORM: case GGML_OP_RMS_NORM: + case GGML_OP_L2_NORM: return true; case GGML_OP_RMS_NORM_BACK: return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0; @@ -3213,6 +3222,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_LEAKY_RELU: case GGML_OP_RWKV_WKV6: case GGML_OP_GATED_LINEAR_ATTN: + case GGML_OP_RWKV_WKV7: return true; case GGML_OP_FLASH_ATTN_EXT: { #ifndef FLASH_ATTN_AVAILABLE diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index f127616ed..0020dbcec 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -201,6 +201,85 @@ static __global__ void rms_norm_back_f32( } } +// template +// static __global__ void l2_norm_f32(const float * x, float * dst, const int ncols, const float eps) { +// const int row = blockIdx.x*blockDim.y + threadIdx.y; +// const int tid = threadIdx.x; + +// float tmp = 0.0f; // partial sum for thread in warp + +// for (int col = tid; col < ncols; col += block_size) { +// const float xi = x[row*ncols + col]; +// tmp += xi * xi; +// } + +// // sum up partial sums +// tmp = warp_reduce_sum(tmp); +// if (block_size > WARP_SIZE) { +// __shared__ float s_sum[32]; +// int warp_id = threadIdx.x / WARP_SIZE; +// int lane_id = threadIdx.x % WARP_SIZE; +// if (lane_id == 0) { +// s_sum[warp_id] = tmp; +// } +// __syncthreads(); +// tmp = s_sum[lane_id]; +// tmp = warp_reduce_sum(tmp); +// } + +// // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html +// const float scale = rsqrtf(fmaxf(tmp, eps * eps)); + +// for (int col = tid; col < ncols; col += block_size) { +// dst[row*ncols + col] = scale * x[row*ncols 
+ col]; +// } +// } + +template +static __global__ void l2_norm_f32( + const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, + const int64_t stride_sample, const float eps) { + const int nrows = gridDim.x; + const int nchannels = gridDim.y; + + const int row = blockIdx.x; + const int channel = blockIdx.y; + const int sample = blockIdx.z; + const int tid = threadIdx.x; + + x += sample*stride_sample + channel*stride_channel + row*stride_row; + dst += ((sample*nchannels + channel)*nrows + row)*ncols; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp); + if constexpr (block_size > WARP_SIZE) { + static_assert(block_size == 1024, "unexpected block_size"); + __shared__ float s_sum[32]; + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html + const float scale = rsqrtf(fmaxf(tmp, eps * eps)); + + for (int col = tid; col < ncols; col += block_size) { + dst[col] = scale * x[col]; + } +} + static void norm_f32_cuda( const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) { @@ -248,6 +327,19 @@ static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * } } +static void l2_norm_f32_cuda( + const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, + const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) { + const dim3 blocks_num(nrows, nchannels, nsamples); + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + l2_norm_f32<<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + } else { + const dim3 block_dims(1024, 1, 1); + l2_norm_f32<1024><<>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + } +} + void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const float * src0_d = (const float *) src0->data; @@ -340,3 +432,27 @@ void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * d rms_norm_back_f32_cuda(grad_d, src0f_d, dst_d, ne00, nrows, eps, stream); } + +void ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *) src0->data; + float * dst_d = (float *) dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_TENSOR_UNARY_OP_LOCALS; + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + GGML_ASSERT(eps >= 0.0f); + + const size_t ts0 = ggml_type_size(src0->type); + GGML_ASSERT(nb00 == ts0); + const int64_t s01 = nb01 / ts0; + const int64_t s02 = nb02 / ts0; + const int64_t s03 = nb03 / ts0; + + l2_norm_f32_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream); +} diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh index d63d34380..706a5660a 100644 --- 
a/ggml/src/ggml-cuda/norm.cuh +++ b/ggml/src/ggml-cuda/norm.cuh @@ -7,3 +7,5 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_l2_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 81964611c..aace21e3a 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -112,7 +112,7 @@ #define cudaGraphExecDestroy hipGraphExecDestroy #define cudaGraphLaunch hipGraphLaunch #define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure -#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult +#define cudaGraphExecUpdateResult hipGraphExecUpdateResult #define cudaGraphNodeType hipGraphNodeType #define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel #define cudaGraphInstantiate hipGraphInstantiate diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 6cc1b69ee..997f67143 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -119,7 +119,7 @@ #define cudaGraphExecDestroy musaGraphExecDestroy #define cudaGraphExec_t musaGraphExec_t #define cudaGraphExecUpdate musaGraphExecUpdate -#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult +#define cudaGraphExecUpdateResult musaGraphExecUpdateResult #define cudaGraphGetNodes musaGraphGetNodes #define cudaGraphInstantiate musaGraphInstantiate #define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams @@ -132,6 +132,7 @@ #define cudaGraph_t musaGraph_t #define cudaKernelNodeParams musaKernelNodeParams #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed +#define cudaStreamBeginCapture musaStreamBeginCapture #define cudaStreamEndCapture musaStreamEndCapture typedef mt_bfloat16 nv_bfloat16; diff --git a/ggml/src/ggml-cuda/wkv.cu b/ggml/src/ggml-cuda/wkv.cu new file mode 100644 index 000000000..d2fced705 --- /dev/null +++ b/ggml/src/ggml-cuda/wkv.cu @@ -0,0 +1,199 @@ +#include "common.cuh" +#include "wkv.cuh" + +template +static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + const int head_size = block_size; + const int batch_i = bid / H; + const int head_i = bid % H; + const int state_size = C * head_size; + const int n_seq_tokens = T / B; + + float state[head_size]; + __shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size]; + + #pragma unroll + for (int i = 0; i < head_size; i++) { + state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid]; + } + + __syncthreads(); + _tf[tid] = tf[head_i * head_size + tid]; + __syncthreads(); + + for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) { + __syncthreads(); + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + __syncthreads(); + + const float _v = v[t]; + float y = 0; + for (int j = 0; j < head_size; j += 4) { + const float4& k = (float4&)(_k[j]); + const float4& r = (float4&)(_r[j]); + const float4& tf = (float4&)(_tf[j]); + const float4& td = (float4&)(_td[j]); + float4& s = (float4&)(state[j]); + float4 kv; + 
+ kv.x = k.x * _v; + kv.y = k.y * _v; + kv.z = k.z * _v; + kv.w = k.w * _v; + + y += r.x * (tf.x * kv.x + s.x); + y += r.y * (tf.y * kv.y + s.y); + y += r.z * (tf.z * kv.z + s.z); + y += r.w * (tf.w * kv.w + s.w); + + s.x = s.x * td.x + kv.x; + s.y = s.y * td.y + kv.y; + s.z = s.z * td.z + kv.z; + s.w = s.w * td.w + kv.w; + } + dst[t] = y; + } + + #pragma unroll + for (int i = 0; i < head_size; i++) { + dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i]; + } +} + +template +static __global__ void rwkv_wkv7_f32(const int B, const int T, const int C, const int H, const float * r, const float * w, const float * k, const float * v, const float * a, const float * b, const float * s, float * dst) { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + const int head_size = block_size; + const int batch_i = bid / H; + const int head_i = bid % H; + const int state_size = C * head_size; + const int n_seq_tokens = T / B; + + float state[head_size]; + __shared__ float _r[head_size], _w[head_size], _k[head_size], _a[head_size], _b[head_size]; + +#ifndef GGML_USE_MUSA + #pragma unroll +#endif + for (int i = 0; i < head_size; i++) { + state[i] = s[batch_i * state_size + head_i * head_size * head_size + tid * head_size + i]; + } + + for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) { + __syncthreads(); + _r[tid] = r[t]; + _w[tid] = w[t]; + _k[tid] = k[t]; + _a[tid] = a[t]; + _b[tid] = b[t]; + __syncthreads(); + + float sa = 0; + #pragma unroll + for (int j = 0; j < head_size; j += 4) + { + const float4& a = (float4&)(_a[j]); + const float4& s = (float4&)(state[j]); + sa += a.x * s.x; + sa += a.y * s.y; + sa += a.z * s.z; + sa += a.w * s.w; + } + + const float _v = v[t]; + float y = 0; + for (int j = 0; j < head_size; j += 4) { + const float4& r = (float4&)(_r[j]); + const float4& w = (float4&)(_w[j]); + const float4& k = (float4&)(_k[j]); + const float4& b = (float4&)(_b[j]); + float4& s = (float4&)(state[j]); + float4 kv; + + kv.x = k.x * _v; + kv.y = k.y * _v; + kv.z = k.z * _v; + kv.w = k.w * _v; + + s.x = s.x * w.x + kv.x + sa * b.x; + s.y = s.y * w.y + kv.y + sa * b.y; + s.z = s.z * w.z + kv.z + sa * b.z; + s.w = s.w * w.w + kv.w + sa * b.w; + + y += s.x * r.x; + y += s.y * r.y; + y += s.z * r.z; + y += s.w * r.w; + } + dst[t] = y; + } + + #pragma unroll + for (int i = 0; i < head_size; i++) { + dst[T * C + batch_i * state_size + head_i * head_size * head_size + tid * head_size + i] = state[i]; + } +} + +void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const float * k_d = (const float *)dst->src[0]->data; + const float * v_d = (const float *)dst->src[1]->data; + const float * r_d = (const float *)dst->src[2]->data; + const float * tf_d = (const float *)dst->src[3]->data; + const float * td_d = (const float *)dst->src[4]->data; + const float * s_d = (const float *)dst->src[5]->data; + + const int64_t B = dst->src[5]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + float * dst_d = (float *)dst->data; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE || C / H == CUDA_WKV_BLOCK_SIZE * 2); + + if (C / H == CUDA_WKV_BLOCK_SIZE) { + rwkv_wkv_f32<<>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d); + } else { + rwkv_wkv_f32<<>>(B, T, C, 
H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d); + } +} + +void ggml_cuda_op_rwkv_wkv7(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const float * r_d = (const float *)dst->src[0]->data; + const float * w_d = (const float *)dst->src[1]->data; + const float * k_d = (const float *)dst->src[2]->data; + const float * v_d = (const float *)dst->src[3]->data; + const float * a_d = (const float *)dst->src[4]->data; + const float * b_d = (const float *)dst->src[5]->data; + const float * s_d = (const float *)dst->src[6]->data; + + const int64_t B = dst->src[6]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + float * dst_d = (float *)dst->data; + + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(dst->src[6]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE || C / H == CUDA_WKV_BLOCK_SIZE * 2); + + if (C / H == CUDA_WKV_BLOCK_SIZE) { + rwkv_wkv7_f32<<>>(B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d); + } else { + rwkv_wkv7_f32<<>>(B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d); + } +} diff --git a/ggml/src/ggml-cuda/wkv6.cuh b/ggml/src/ggml-cuda/wkv.cuh similarity index 62% rename from ggml/src/ggml-cuda/wkv6.cuh rename to ggml/src/ggml-cuda/wkv.cuh index a7124ee51..9623dd7f8 100644 --- a/ggml/src/ggml-cuda/wkv6.cuh +++ b/ggml/src/ggml-cuda/wkv.cuh @@ -3,3 +3,5 @@ #define CUDA_WKV_BLOCK_SIZE 64 void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rwkv_wkv7(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/wkv6.cu b/ggml/src/ggml-cuda/wkv6.cu deleted file mode 100644 index bbdafbee5..000000000 --- a/ggml/src/ggml-cuda/wkv6.cu +++ /dev/null @@ -1,89 +0,0 @@ -#include "common.cuh" -#include "wkv6.cuh" - -static __global__ void rwkv_wkv_f32(const int B, const int T, const int C, const int H, const float * k, const float * v, const float * r, const float * tf, const float * td, const float * s, float * dst) { - const int tid = threadIdx.x; - const int bid = blockIdx.x; - - const int head_size = CUDA_WKV_BLOCK_SIZE; - const int batch_i = bid / H; - const int head_i = bid % H; - const int state_size = C * head_size; - const int n_seq_tokens = T / B; - - float state[head_size]; - __shared__ float _k[head_size], _r[head_size], _tf[head_size], _td[head_size]; - - #pragma unroll - for (int i = 0; i < head_size; i++) { - state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid]; - } - - __syncthreads(); - _tf[tid] = tf[head_i * head_size + tid]; - __syncthreads(); - - for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; t += C) { - __syncthreads(); - _k[tid] = k[t]; - _r[tid] = r[t]; - _td[tid] = td[t]; - __syncthreads(); - - const float _v = v[t]; - float y = 0; - for (int j = 0; j < head_size; j += 4) { - const float4& k = (float4&)(_k[j]); - const float4& r = (float4&)(_r[j]); - const float4& tf = (float4&)(_tf[j]); - const float4& td = (float4&)(_td[j]); - float4& s = (float4&)(state[j]); - float4 kv; - - kv.x = k.x * _v; - kv.y = k.y * _v; - kv.z = k.z * _v; - kv.w = k.w * _v; - - y += r.x * (tf.x * kv.x + s.x); - y += r.y * (tf.y * kv.y + s.y); - y += r.z * (tf.z * kv.z + s.z); - y += r.w * (tf.w * kv.w + s.w); - - s.x = s.x * td.x + kv.x; - s.y = s.y * td.y + kv.y; - s.z = s.z * td.z + kv.z; - s.w = s.w * td.w + kv.w; - } - dst[t] = y; - } - - #pragma unroll - for (int i = 
-        dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i];
-    }
-}
-
-void ggml_cuda_op_rwkv_wkv6(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const float * k_d = (const float *)dst->src[0]->data;
-    const float * v_d = (const float *)dst->src[1]->data;
-    const float * r_d = (const float *)dst->src[2]->data;
-    const float * tf_d = (const float *)dst->src[3]->data;
-    const float * td_d = (const float *)dst->src[4]->data;
-    const float * s_d = (const float *)dst->src[5]->data;
-
-    const int64_t B = dst->src[5]->ne[1];
-    const int64_t T = dst->src[0]->ne[2];
-    const int64_t C = dst->ne[0];
-    const int64_t H = dst->src[0]->ne[1];
-
-    float * dst_d = (float *)dst->data;
-
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32);
-    GGML_ASSERT(C % H == 0);
-    GGML_ASSERT(C / H == CUDA_WKV_BLOCK_SIZE); // The current cuda kernel is designed for RWKV6, HEAD_SIZE == 64
-
-    rwkv_wkv_f32<<<B * H, C / H, 0, stream>>>(B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d);
-}
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index a58c474eb..1e954b4ce 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -285,6 +285,13 @@ typedef struct {
     float eps;
 } ggml_metal_kargs_rms_norm;
 
+typedef struct {
+    int32_t ne00;
+    int32_t ne00_4;
+    uint64_t nb01;
+    float eps;
+} ggml_metal_kargs_l2_norm;
+
 typedef struct {
     int64_t ne00;
     int64_t ne01;
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index e51a4169a..af65e7d9f 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -184,10 +184,13 @@ enum ggml_metal_kernel_type {
     GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS,
     GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,
     GGML_METAL_KERNEL_TYPE_RMS_NORM,
+    GGML_METAL_KERNEL_TYPE_L2_NORM,
    GGML_METAL_KERNEL_TYPE_GROUP_NORM,
     GGML_METAL_KERNEL_TYPE_NORM,
     GGML_METAL_KERNEL_TYPE_SSM_CONV_F32,
     GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32,
+    GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32,
+    GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
     GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
@@ -810,10 +813,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_L2_NORM, l2_norm, has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_CONV_F32, ssm_conv_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SSM_SCAN_F32, ssm_scan_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat);
         GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat);
@@ -1251,6 +1257,7 @@ static
bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_GROUP_NORM: return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]); case GGML_OP_RMS_NORM: + case GGML_OP_L2_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); case GGML_OP_ARGMAX: return true; @@ -1288,6 +1295,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex return has_simdgroup_mm; // TODO: over-restricted for vec-kernels case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: + case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: @@ -2216,6 +2225,83 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(d_inner, n_seqs, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_RWKV_WKV6: + { + const int64_t B = dst->src[5]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == 64); + + size_t offs_src3 = 0; + size_t offs_src4 = 0; + size_t offs_src5 = 0; + + id id_src3 = dst->src[3] ? ggml_metal_get_buffer(dst->src[3], &offs_src3) : nil; + id id_src4 = dst->src[4] ? ggml_metal_get_buffer(dst->src[4], &offs_src4) : nil; + id id_src5 = dst->src[5] ? ggml_metal_get_buffer(dst->src[5], &offs_src5) : nil; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; + [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:6]; + + [encoder setBytes:&B length:sizeof(B) atIndex:7]; + [encoder setBytes:&T length:sizeof(T) atIndex:8]; + [encoder setBytes:&C length:sizeof(C) atIndex:9]; + [encoder setBytes:&H length:sizeof(H) atIndex:10]; + + [encoder dispatchThreadgroups:MTLSizeMake(B * H, 1, 1) threadsPerThreadgroup:MTLSizeMake(C/ H, 1, 1)]; + } break; + case GGML_OP_RWKV_WKV7: + { + const int64_t B = dst->src[6]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + GGML_ASSERT(dst->src[6]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == 64); + + size_t offs_src3 = 0; + size_t offs_src4 = 0; + size_t offs_src5 = 0; + size_t offs_src6 = 0; + + id id_src3 = dst->src[3] ? ggml_metal_get_buffer(dst->src[3], &offs_src3) : nil; + id id_src4 = dst->src[4] ? ggml_metal_get_buffer(dst->src[4], &offs_src4) : nil; + id id_src5 = dst->src[5] ? ggml_metal_get_buffer(dst->src[5], &offs_src5) : nil; + id id_src6 = dst->src[6] ? 
ggml_metal_get_buffer(dst->src[6], &offs_src6) : nil; + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + [encoder setBuffer:id_src4 offset:offs_src4 atIndex:4]; + [encoder setBuffer:id_src5 offset:offs_src5 atIndex:5]; + [encoder setBuffer:id_src6 offset:offs_src6 atIndex:6]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:7]; + + [encoder setBytes:&B length:sizeof(B) atIndex:8]; + [encoder setBytes:&T length:sizeof(T) atIndex:9]; + [encoder setBytes:&C length:sizeof(C) atIndex:10]; + [encoder setBytes:&H length:sizeof(H) atIndex:11]; + + [encoder dispatchThreadgroups:MTLSizeMake(B * H, 1, 1) threadsPerThreadgroup:MTLSizeMake(C/ H, 1, 1)]; + } break; case GGML_OP_MUL_MAT: { GGML_ASSERT(ne00 == ne10); @@ -3122,6 +3208,42 @@ static void ggml_metal_encode_node( const int64_t nrows = ggml_nrows(src0); + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_L2_NORM: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(ggml_is_contiguous_1(src0)); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_L2_NORM].pipeline; + + int nth = 32; // SIMD width + + while (nth < ne00/4 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) { + nth *= 2; + } + + nth = MIN(nth, ne00/4); + + ggml_metal_kargs_l2_norm args = { + /*.ne00 =*/ ne00, + /*.ne00_4 =*/ ne00/4, + /*.nb01 =*/ nb01, + /*.eps =*/ eps, + }; + + [encoder setComputePipelineState:pipeline]; + [encoder setBytes:&args length:sizeof(args) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_GROUP_NORM: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index ad9d42a3e..3cef81b79 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1295,6 +1295,184 @@ kernel void kernel_ssm_scan_f32( } } +kernel void kernel_rwkv_wkv6_f32( + device const float * k, + device const float * v, + device const float * r, + device const float * tf, + device const float * td, + device const float * state_in, + device float * dst, + constant uint & B, + constant uint & T, + constant uint & C, + constant uint & H, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const uint head_size = 64; // TODO: support head_size = 128 + const uint batch_id = tgpig.x / H; + const uint head_id = tgpig.x % H; + const uint tid = tpitg.x; + + if (batch_id >= B || head_id >= H) { + return; + } + + const uint state_size = C * head_size; + const uint n_seq_tokens = T / B; + + threadgroup float _k[head_size]; + threadgroup float _r[head_size]; + threadgroup float _tf[head_size]; + threadgroup float _td[head_size]; + + float state[head_size]; + + for (uint i = 0; i < head_size; i++) { + state[i] = state_in[batch_id * state_size + head_id * head_size * head_size + + i * head_size + 
tid]; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + _tf[tid] = tf[head_id * head_size + tid]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; + const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; + + for (uint t = start_t; t < end_t; t += C) { + threadgroup_barrier(mem_flags::mem_threadgroup); + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float v_val = v[t]; + float y = 0.0; + + for (uint j = 0; j < head_size; j += 4) { + float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + float4 tf_vec = float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); + float4 td_vec = float4(_td[j], _td[j+1], _td[j+2], _td[j+3]); + float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); + + float4 kv = k_vec * v_val; + + float4 temp = tf_vec * kv + s_vec; + y += dot(r_vec, temp); + + s_vec = s_vec * td_vec + kv; + state[j] = s_vec[0]; + state[j+1] = s_vec[1]; + state[j+2] = s_vec[2]; + state[j+3] = s_vec[3]; + } + + dst[t] = y; + } + + for (uint i = 0; i < head_size; i++) { + dst[T * C + batch_id * state_size + head_id * head_size * head_size + + i * head_size + tid] = state[i]; + } +} + +kernel void kernel_rwkv_wkv7_f32( + device const float * r, + device const float * w, + device const float * k, + device const float * v, + device const float * a, + device const float * b, + device const float * state_in, + device float * dst, + constant uint & B, + constant uint & T, + constant uint & C, + constant uint & H, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + + const uint head_size = 64; // TODO: support head_size = 128 + const uint batch_id = tgpig.x / H; + const uint head_id = tgpig.x % H; + const uint tid = tpitg.x; + + if (batch_id >= B || head_id >= H) { + return; + } + + const uint state_size = C * head_size; + const uint n_seq_tokens = T / B; + + threadgroup float _r[head_size]; + threadgroup float _w[head_size]; + threadgroup float _k[head_size]; + threadgroup float _a[head_size]; + threadgroup float _b[head_size]; + + float state[head_size]; + + for (uint i = 0; i < head_size; i++) { + state[i] = state_in[batch_id * state_size + head_id * head_size * head_size + + tid * head_size + i]; + } + + const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; + const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; + + for (uint t = start_t; t < end_t; t += C) { + threadgroup_barrier(mem_flags::mem_threadgroup); + _r[tid] = r[t]; + _w[tid] = w[t]; + _k[tid] = k[t]; + _a[tid] = a[t]; + _b[tid] = b[t]; + threadgroup_barrier(mem_flags::mem_threadgroup); + + const float v_val = v[t]; + float y = 0.0, sa = 0.0; + + float4 sa_vec(0.0); + + for (int j = 0; j < head_size; j += 4) { + float4 a_vec = float4(_a[j], _a[j+1], _a[j+2], _a[j+3]); + float4 s_vec = float4(state[j], state[j+1], state[j+2], state[j+3]); + sa_vec += a_vec * s_vec; + } + sa = sa_vec[0] + sa_vec[1] + sa_vec[2] + sa_vec[3]; + + for (uint j = 0; j < head_size; j += 4) { + float4 r_vec = float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + float4 w_vec = float4(_w[j], _w[j+1], _w[j+2], _w[j+3]); + float4 k_vec = float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + float4 b_vec = float4(_b[j], _b[j+1], _b[j+2], _b[j+3]); + float4 s_vec = float4(state[j], state[j+1], 
state[j+2], state[j+3]); + + float4 kv = k_vec * v_val; + + s_vec = s_vec * w_vec + kv + sa * b_vec; + y += dot(s_vec, r_vec); + + state[j] = s_vec[0]; + state[j+1] = s_vec[1]; + state[j+2] = s_vec[2]; + state[j+3] = s_vec[3]; + } + + dst[t] = y; + } + + for (uint i = 0; i < head_size; i++) { + dst[T * C + batch_id * state_size + head_id * head_size * head_size + + tid * head_size + i] = state[i]; + } +} + kernel void kernel_argmax( device const void * x, device int32_t * dst, @@ -1463,6 +1641,49 @@ kernel void kernel_rms_norm( } } +kernel void kernel_l2_norm( + constant ggml_metal_kargs_l2_norm & args, + device const char * src0, + device char * dst, + threadgroup float * shmem_f32 [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + ushort tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort ntg[[threads_per_threadgroup]]) { + if (sgitg == 0) { + shmem_f32[tiisg] = 0.0f; + } + + device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01); + + float sumf = 0.0f; + + // parallel sum + for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) { + sumf += dot(x[i00], x[i00]); + } + sumf = simd_sum(sumf); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shmem_f32[sgitg] = sumf; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + sumf = shmem_f32[tiisg]; + sumf = simd_sum(sumf); + + const float scale = 1.0f/sqrt(max(sumf, args.eps)); + + device float4 * y = (device float4 *) dst + tgpig*args.ne00_4; + for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) { + y[i00] = x[i00] * scale; + } +} + kernel void kernel_group_norm( device const float * src0, device float * dst, diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index 166970ca6..92f05d555 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -67,10 +67,6 @@ if (MUSAToolkit_FOUND) add_compile_definitions(GGML_USE_MUSA) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) - if (GGML_CUDA_GRAPHS) - add_compile_definitions(GGML_CUDA_USE_GRAPHS) - endif() - if (GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 3ad044432..271413ca4 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -66,6 +66,9 @@ if (WIN32) find_package(MKL REQUIRED) target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() + if (GGML_SYCL_GRAPH) + add_compile_definitions(GGML_SYCL_GRAPH) + endif() if (GGML_SYCL_TARGET STREQUAL "INTEL") target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 577ff51fd..73d807cab 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -26,7 +26,7 @@ #include "softmax.hpp" #include "tsembd.hpp" #include "im2col.hpp" -#include "wkv6.hpp" +#include "wkv.hpp" #include "outprod.hpp" #include "element_wise.hpp" #include "cpy.hpp" diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index a92988b7d..7cc5e14f9 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -301,6 +301,7 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture 
&arch) { return opt; } +namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; std::string name; @@ -392,6 +393,10 @@ struct ggml_backend_sycl_context { return pool(device); } +#ifdef GGML_SYCL_GRAPH + std::unique_ptr> exec_graph = nullptr; +#endif + ggml_sycl_pool & host_pool(int device) { if (host_pools[device] == nullptr) { host_pools[device] = new_pool_for_host(stream(device, 0), device); @@ -474,6 +479,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13, /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, /*int s10,*/ int s11, int s12, int s13, const sycl::nd_item<3> &item_ct1) { const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -495,9 +501,9 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, const int i12 = i2 % ne12; const int i13 = i3 % ne13; - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; const src0_t * src0_row = src0 + i_src0; const src1_t * src1_row = src1 + i_src1; @@ -515,6 +521,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t int ne0, int ne1, int ne2, int ne3, int ne10, int ne11, int ne12, int ne13, /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, /*int s10,*/ int s11, int s12, int s13, const sycl::nd_item<3> &item_ct1) { @@ -534,9 +541,9 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t const int i12 = i2 % ne12; const int i13 = i3 % ne13; - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; const src0_t * src0_row = src0 + i_src0; const src1_t * src1_row = src1 + i_src1; @@ -566,9 +573,11 @@ struct bin_bcast_sycl { int nr[4] = { nr0, nr1, nr2, nr3 }; // collapse dimensions until first broadcast dimension - int64_t cne0[] = {ne0, ne1, ne2, ne3}; + int64_t cne[] = {ne0, ne1, ne2, ne3}; + int64_t cne0[] = {ne00, ne01, ne02, ne03}; int64_t cne1[] = {ne10, ne11, ne12, ne13}; - size_t cnb0[] = {nb0, nb1, nb2, nb3}; + size_t cnb[] = {nb0, nb1, nb2, nb3}; + size_t cnb0[] = {nb00, nb01, nb02, nb03}; size_t cnb1[] = {nb10, nb11, nb12, nb13}; auto collapse = [](int64_t cne[]) { cne[0] *= cne[1]; @@ -583,32 +592,41 @@ struct bin_bcast_sycl { cnb[3] *= cne[3]; }; - for (int i = 0; i < 4; i++) { - if (nr[i] != 1) { - break; - } - if (i > 0) { - collapse_nb(cnb0, cne0); - collapse_nb(cnb1, cne1); - collapse(cne0); - collapse(cne1); + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb, cne); + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne); + collapse(cne0); + collapse(cne1); + } } } { - int64_t ne0 = cne0[0]; - int64_t ne1 = cne0[1]; - int64_t ne2 = cne0[2]; - int64_t ne3 = cne0[3]; + int64_t ne0 = cne[0]; + int64_t ne1 = cne[1]; + int64_t ne2 = cne[2]; + int64_t ne3 = cne[3]; int64_t ne10 = cne1[0]; int64_t ne11 = cne1[1]; int64_t ne12 = cne1[2]; int64_t ne13 = cne1[3]; - size_t nb0 = cnb0[0]; - size_t nb1 = cnb0[1]; - size_t nb2 = cnb0[2]; - size_t nb3 = cnb0[3]; + 
size_t nb0 = cnb[0]; + size_t nb1 = cnb[1]; + size_t nb2 = cnb[2]; + size_t nb3 = cnb[3]; + + size_t nb00 = cnb0[0]; + size_t nb01 = cnb0[1]; + size_t nb02 = cnb0[2]; + size_t nb03 = cnb0[3]; size_t nb10 = cnb1[0]; size_t nb11 = cnb1[1]; @@ -625,6 +643,28 @@ struct bin_bcast_sycl { size_t s12 = nb12 / sizeof(src1_t); size_t s13 = nb13 / sizeof(src1_t); + size_t s00 = nb00 / sizeof(src0_t); + size_t s01 = nb01 / sizeof(src0_t); + size_t s02 = nb02 / sizeof(src0_t); + size_t s03 = nb03 / sizeof(src0_t); + + GGML_UNUSED(s00); + + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); + GGML_ASSERT(nb1 % sizeof(dst_t) == 0); + GGML_ASSERT(nb2 % sizeof(dst_t) == 0); + GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + + GGML_ASSERT(nb00 % sizeof(src0_t) == 0); + GGML_ASSERT(nb01 % sizeof(src0_t) == 0); + GGML_ASSERT(nb02 % sizeof(src0_t) == 0); + GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + + GGML_ASSERT(nb10 % sizeof(src1_t) == 0); + GGML_ASSERT(nb11 % sizeof(src1_t) == 0); + GGML_ASSERT(nb12 % sizeof(src1_t) == 0); + GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + GGML_ASSERT(s0 == 1); GGML_ASSERT(s10 == 1); @@ -661,8 +701,8 @@ struct bin_bcast_sycl { [=](sycl::nd_item<3> item_ct1) { k_bin_bcast_unravel( src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, - s13, item_ct1); + ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, + s03, s11, s12, s13, item_ct1); }); } } else { @@ -680,7 +720,7 @@ struct bin_bcast_sycl { [=](sycl::nd_item<3> item_ct1) { k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, - s1, s2, s3, s11, s12, s13, + s1, s2, s3, s01, s02, s03, s11, s12, s13, item_ct1); }); } diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 86b200e07..76ac6a4dd 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -138,7 +138,7 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE), sycl::range<3>(1, 1, WARP_SIZE)), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{ + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ dequantize_block_q4_0_reorder(vx, y, k, item_ct1); }); diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 99d3859de..04a85fa35 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -210,7 +210,7 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1); }); @@ -879,7 +879,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloa stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_reorder( vx, y, dst, ncols, nrows, item_ct1); }); @@ -902,7 +902,7 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) 
[[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec( vx, y, dst, ncols, nrows, item_ct1); }); @@ -923,7 +923,7 @@ static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec( vx, y, dst, ncols, nrows, item_ct1); }); @@ -944,7 +944,7 @@ static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec( vx, y, dst, ncols, nrows, item_ct1); }); @@ -965,7 +965,7 @@ static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec( vx, y, dst, ncols, nrows, item_ct1); }); @@ -986,7 +986,7 @@ static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec( vx, y, dst, ncols, nrows, item_ct1); }); @@ -1004,7 +1004,7 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1020,7 +1020,7 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1036,7 +1036,7 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1049,7 +1049,7 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); }); } @@ -1065,7 
+1065,7 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1143,7 +1143,6 @@ void ggml_sycl_op_dequantize_mul_mat_vec( default: printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); GGML_ABORT("fatal error"); - break; } GGML_UNUSED(src1); diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 4bcd74376..1e12cb220 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,7 +1,7 @@ #include "common.hpp" #include "element_wise.hpp" -void acc_f32(const float * x, const float * y, float * dst, const int ne, +static void acc_f32(const float * x, const float * y, float * dst, const int ne, const int ne10, const int ne11, const int ne12, const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -20,7 +20,7 @@ void acc_f32(const float * x, const float * y, float * dst, const int ne, } } -void gelu_f32(const float * x, float * dst, const int k, +static void gelu_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const float GELU_COEF_A = 0.044715f; const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; @@ -37,7 +37,7 @@ void gelu_f32(const float * x, float * dst, const int k, sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi))); } -void silu_f32(const float * x, float * dst, const int k, +static void silu_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -48,7 +48,7 @@ void silu_f32(const float * x, float * dst, const int k, dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i])); } -void gelu_quick_f32(const float *x, float *dst, int k, +static void gelu_quick_f32(const float *x, float *dst, int k, const sycl::nd_item<3> &item_ct1) { const float GELU_QUICK_COEF = -1.702f; const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -59,7 +59,7 @@ void gelu_quick_f32(const float *x, float *dst, int k, dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i]))); } -void tanh_f32(const float *x, float *dst, int k, +static void tanh_f32(const float *x, float *dst, int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -69,7 +69,7 @@ void tanh_f32(const float *x, float *dst, int k, dst[i] = sycl::tanh((float)(x[i])); } -void relu_f32(const float * x, float * dst, const int k, +static void relu_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -80,7 +80,7 @@ void relu_f32(const float * x, float * dst, const int k, dst[i] = sycl::fmax((float)(x[i]), (float)0); } -void sigmoid_f32(const float * x, float * dst, const int k, +static void sigmoid_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + 
item_ct1.get_local_id(2); @@ -91,7 +91,7 @@ void sigmoid_f32(const float * x, float * dst, const int k, dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i])); } -void sqrt_f32(const float * x, float * dst, const int k, +static void sqrt_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -102,7 +102,7 @@ void sqrt_f32(const float * x, float * dst, const int k, dst[i] = sycl::sqrt(x[i]); } -void sin_f32(const float * x, float * dst, const int k, +static void sin_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -113,7 +113,7 @@ void sin_f32(const float * x, float * dst, const int k, dst[i] = sycl::sin(x[i]); } -void cos_f32(const float * x, float * dst, const int k, +static void cos_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -124,7 +124,7 @@ void cos_f32(const float * x, float * dst, const int k, dst[i] = sycl::cos(x[i]); } -void hardsigmoid_f32(const float * x, float * dst, const int k, +static void hardsigmoid_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -135,7 +135,7 @@ void hardsigmoid_f32(const float * x, float * dst, const int k, dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); } -void hardswish_f32(const float * x, float * dst, const int k, +static void hardswish_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -146,7 +146,7 @@ void hardswish_f32(const float * x, float * dst, const int k, dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); } -void exp_f32(const float * x, float * dst, const int k, +static void exp_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -157,7 +157,7 @@ void exp_f32(const float * x, float * dst, const int k, dst[i] = sycl::exp(x[i]); } -void log_f32(const float * x, float * dst, const int k, +static void log_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -173,7 +173,7 @@ void log_f32(const float * x, float * dst, const int k, } } -void neg_f32(const float * x, float * dst, const int k, +static void neg_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -184,7 +184,7 @@ void neg_f32(const float * x, float * dst, const int k, dst[i] = -x[i]; } -void step_f32(const float * x, float * dst, const int k, +static void step_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -195,7 +195,7 @@ void step_f32(const float * x, float * dst, const int k, dst[i] = x[i] > 0.0f; } -void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, +static void 
leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -206,7 +206,7 @@ void leaky_relu_f32(const float *x, float *dst, const int k, const float negativ sycl::fmin((float)(x[i]), 0.0f) * negative_slope; } -void sqr_f32(const float * x, float * dst, const int k, +static void sqr_f32(const float * x, float * dst, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); @@ -217,7 +217,7 @@ void sqr_f32(const float * x, float * dst, const int k, dst[i] = x[i] * x[i]; } -void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, +static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int ne13, const float sf0, const float sf1, const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { @@ -240,7 +240,7 @@ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01, dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); } -void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, +static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02, const sycl::nd_item<3> &item_ct1) { int nidx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); @@ -262,7 +262,7 @@ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const i -void acc_f32_sycl(const float *x, const float *y, float *dst, +static void acc_f32_sycl(const float *x, const float *y, float *dst, const int n_elements, const int ne10, const int ne11, const int ne12, const int nb1, const int nb2, const int offset, queue_ptr stream) { @@ -277,7 +277,7 @@ void acc_f32_sycl(const float *x, const float *y, float *dst, }); } -void gelu_f32_sycl(const float *x, float *dst, const int k, +static void gelu_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; stream->parallel_for( @@ -289,7 +289,7 @@ void gelu_f32_sycl(const float *x, float *dst, const int k, }); } -void silu_f32_sycl(const float *x, float *dst, const int k, +static void silu_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE; stream->parallel_for( @@ -301,7 +301,7 @@ void silu_f32_sycl(const float *x, float *dst, const int k, }); } -void gelu_quick_f32_sycl(const float *x, float *dst, const int k, +static void gelu_quick_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE; stream->parallel_for( @@ -313,7 +313,7 @@ void gelu_quick_f32_sycl(const float *x, float *dst, const int k, }); } -void tanh_f32_sycl(const float *x, float *dst, const int k, +static void tanh_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE; stream->parallel_for( @@ -325,7 +325,7 @@ void tanh_f32_sycl(const float *x, float *dst, const int k, }); } -void relu_f32_sycl(const float *x, float *dst, const int k, +static void relu_f32_sycl(const float *x, float 
*dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; stream->parallel_for( @@ -337,7 +337,7 @@ void relu_f32_sycl(const float *x, float *dst, const int k, }); } -void hardsigmoid_f32_sycl(const float *x, float *dst, const int k, +static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; stream->parallel_for( @@ -349,7 +349,7 @@ void hardsigmoid_f32_sycl(const float *x, float *dst, const int k, }); } -void hardswish_f32_sycl(const float *x, float *dst, const int k, +static void hardswish_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; stream->parallel_for( @@ -361,7 +361,7 @@ void hardswish_f32_sycl(const float *x, float *dst, const int k, }); } -void exp_f32_sycl(const float *x, float *dst, const int k, +static void exp_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; stream->parallel_for( @@ -373,7 +373,7 @@ void exp_f32_sycl(const float *x, float *dst, const int k, }); } -void log_f32_sycl(const float *x, float *dst, const int k, +static void log_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE; stream->parallel_for( @@ -385,7 +385,7 @@ void log_f32_sycl(const float *x, float *dst, const int k, }); } -void neg_f32_sycl(const float *x, float *dst, const int k, +static void neg_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; stream->parallel_for( @@ -397,7 +397,7 @@ void neg_f32_sycl(const float *x, float *dst, const int k, }); } -void step_f32_sycl(const float *x, float *dst, const int k, +static void step_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE; stream->parallel_for( @@ -409,7 +409,7 @@ void step_f32_sycl(const float *x, float *dst, const int k, }); } -void sigmoid_f32_sycl(const float *x, float *dst, const int k, +static void sigmoid_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE; stream->parallel_for( @@ -421,7 +421,7 @@ void sigmoid_f32_sycl(const float *x, float *dst, const int k, }); } -void sqrt_f32_sycl(const float *x, float *dst, const int k, +static void sqrt_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE; stream->parallel_for( @@ -433,7 +433,7 @@ void sqrt_f32_sycl(const float *x, float *dst, const int k, }); } -void sin_f32_sycl(const float *x, float *dst, const int k, +static void sin_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; stream->parallel_for( @@ -445,7 +445,7 @@ void sin_f32_sycl(const float *x, float *dst, const int k, }); } -void cos_f32_sycl(const float *x, float *dst, const int k, +static void cos_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE; stream->parallel_for( @@ -457,7 +457,7 
@@ void cos_f32_sycl(const float *x, float *dst, const int k, }); } -void leaky_relu_f32_sycl(const float *x, float *dst, const int k, +static void leaky_relu_f32_sycl(const float *x, float *dst, const int k, const float negative_slope, queue_ptr stream) { const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE; @@ -470,7 +470,7 @@ void leaky_relu_f32_sycl(const float *x, float *dst, const int k, }); } -void sqr_f32_sycl(const float *x, float *dst, const int k, +static void sqr_f32_sycl(const float *x, float *dst, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE; stream->parallel_for( @@ -482,7 +482,7 @@ void sqr_f32_sycl(const float *x, float *dst, const int k, }); } -void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01, +static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int ne13, const float sf0, const float sf1, const float sf2, const float sf3, queue_ptr stream) { @@ -496,7 +496,7 @@ void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01 }); } -void pad_f32_sycl(const float *x, float *dst, const int ne00, +static void pad_f32_sycl(const float *x, float *dst, const int ne00, const int ne01, const int ne02, const int ne0, const int ne1, const int ne2, queue_ptr stream) { int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE; diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 51c19f6b3..b9cf8767c 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -207,7 +207,7 @@ static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_te const size_t nrows = ne01; const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2); stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{ + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ k_get_rows_reorder( src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); @@ -302,7 +302,6 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *s // TODO: k-quants GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); GGML_ABORT("fatal error"); - break; } } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 6977b705e..360e3f166 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -46,6 +46,7 @@ static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; int g_ggml_sycl_disable_optimize = 0; +int g_ggml_sycl_disable_graph = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -95,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() { return info; } -void print_device_detail(int id, sycl::device &device, std::string device_type) { +static void print_device_detail(int id, sycl::device &device, std::string device_type) { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR( @@ -118,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) global_mem_size, device.get_info().c_str()); } -void print_device_opt_feature(int device_count) { +static void print_device_opt_feature(int device_count) { GGML_LOG_INFO("SYCL Optimization Feature:\n"); 
GGML_LOG_INFO( "|ID| Device Type|Reorder|\n"); @@ -191,10 +192,12 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); + g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize); + GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph); GGML_LOG_INFO("Build with Macros:\n"); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n"); @@ -333,10 +336,11 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, assert(tensor->view_src->buffer->buft == buffer->buft); return GGML_STATUS_SUCCESS; } - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - tensor->extra = extra; - ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx. + if (tensor->type == GGML_TYPE_Q4_0) { + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + tensor->extra = extra; + ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx. + } if (ggml_is_quantized(tensor->type)) { // initialize padding to 0 to avoid possible NaN values @@ -400,7 +404,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, +static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); @@ -486,6 +490,22 @@ catch (sycl::exception const &exc) { std::exit(1); } +static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); + if (buffer == nullptr) { + return; + } + + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; + + if (ctx != nullptr) { + for (ggml_tensor_extra_gpu * extra : ctx->tensor_extras) { + release_extra_gpu(extra); + } + ctx->tensor_extras.clear(); // reset the tensor_extras vector + } +} + static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, /* .get_base = */ ggml_backend_sycl_buffer_get_base, @@ -495,7 +515,7 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor, /* .clear = */ ggml_backend_sycl_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ ggml_backend_sycl_buffer_reset, }; // sycl buffer type @@ -576,7 +596,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { static std::mutex mutex; std::lock_guard lock(mutex); - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); auto dev_count = ggml_backend_sycl_get_device_count(); @@ -604,7 +623,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { return &ggml_backend_sycl_buffer_types[device]; } -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) { +static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) { GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); int device = ctx->device; @@ -1666,7 
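Note: GGML_SYCL_DISABLE_GRAPH is read through the same get_sycl_env helper as the existing switches, with a default of 1, so SYCL graph capture stays opt-in (set GGML_SYCL_DISABLE_GRAPH=0 to enable it). A rough sketch of the helper's behaviour, assuming the semantics of the existing get_sycl_env in this file:

    // Rough sketch (assumed semantics, needs <cstdlib> and <cstdio>): return the integer
    // value of the environment variable if it is set and parses, otherwise the default.
    static int get_sycl_env_sketch(const char * env_name, int default_val) {
        const char * str = std::getenv(env_name);
        int value = default_val;
        if (str != nullptr) {
            std::sscanf(str, " %d", &value);   // keep the default on parse failure
        }
        return value;                          // e.g. GGML_SYCL_DISABLE_GRAPH defaults to 1
    }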
+1685,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, stream->parallel_for( sycl::nd_range<3>(num_blocks * block_size, block_size), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { quantize_q8_1(x, vy, kx, kx_padded, item_ct1); }); } @@ -1687,7 +1706,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y, item_ct1); }); @@ -1707,7 +1726,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y / nchannels_x, item_ct1); @@ -1748,7 +1767,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, const sycl::range<3> block_nums(1, nrows, 1); stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { k_sum_rows_f32(x, dst, ncols, item_ct1); }); } @@ -2680,6 +2699,12 @@ static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * ds GGML_SYCL_DEBUG("call %s done\n", __func__); } +static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_l2_norm); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm); @@ -2898,7 +2923,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) { return false; } -bool ggml_sycl_supports_dmmv(enum ggml_type type) { +static bool ggml_sycl_supports_dmmv(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -3113,8 +3138,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const int64_t i2 = i12; src0_row.data = src0_original + i02*nb02; - src1_row.data = src1_original + + i11*nb11 + i12*nb12; - dst_row.data = dst_original + i1*nb1 + i2*nb2; + src1_row.data = src1_original + i11*nb11 + i12*nb12; + dst_row.data = dst_original + i1*nb1 + i2*nb2; ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row); } @@ -3271,7 +3296,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } -void ggml_sycl_set_main_device(const int main_device) try { +static void ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == static_cast (main_device)) { return; } @@ -3292,7 +3317,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) { +static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) { if (!g_sycl_loaded) return false; if (dst->src[0] != nullptr && 
ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) { @@ -3394,6 +3419,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_RMS_NORM: ggml_sycl_rms_norm(ctx, dst); break; + case GGML_OP_L2_NORM: + ggml_sycl_l2_norm(ctx, dst); + break; case GGML_OP_MUL_MAT: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; @@ -3471,6 +3499,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens case GGML_OP_RWKV_WKV6: ggml_sycl_op_rwkv_wkv6(ctx, dst); break; + case GGML_OP_RWKV_WKV7: + ggml_sycl_op_rwkv_wkv7(ctx, dst); + break; case GGML_OP_GATED_LINEAR_ATTN: ggml_sycl_op_gated_linear_attn(ctx, dst); break; @@ -3610,7 +3641,7 @@ catch (sycl::exception const &exc) { std::exit(1); } -void reorder_qw(char *data_device, const int ncols, const int nrows, +static void reorder_qw(char *data_device, const int ncols, const int nrows, size_t size, size_t offset, dpct::queue_ptr stream) { auto tmp_buf = sycl::malloc_shared(size, *stream); SYCL_CHECK( @@ -3624,7 +3655,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows, stream->parallel_for( size / sizeof(block_q4_0), - [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { const block_q4_0* x = (const block_q4_0*)tmp_buf; const int ib = i; @@ -3638,7 +3669,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows, sycl::free(tmp_buf, *stream); } -void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { +static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { char*data_device = (char*)src0->data; size_t ncols = src0->ne[0]; size_t nrows = src0->ne[1]; @@ -3647,7 +3678,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { reorder_qw(data_device, ncols, nrows, size, 0, stream); } -void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { +static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { ggml_tensor *src0 = dst->src[0]; ggml_tensor *src1 = dst->src[1]; @@ -3660,7 +3691,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { } } -void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { +static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { dpct::queue_ptr stream = ctx->stream(); if (ctx->optimized_graph) { return; @@ -3671,10 +3702,9 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream); } } -static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - ggml_sycl_set_main_device(sycl_ctx->device); +static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) { + ggml_sycl_set_main_device(sycl_ctx->device); if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx); for (int i = 0; i < cgraph->n_nodes; i++) { @@ -3696,7 +3726,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_ } GGML_ASSERT(ok); } +} +static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + auto * sycl_ctx = static_cast(backend->context); + +#ifdef GGML_SYCL_GRAPH + if (!g_ggml_sycl_disable_graph) { + if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) { + 
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); + ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); + return GGML_STATUS_SUCCESS; + } + + sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream())); + model_sycl_graph.begin_recording(*(sycl_ctx->stream())); + ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); + model_sycl_graph.end_recording(); + + if (!sycl_ctx->exec_graph) { + auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + sycl_ctx->exec_graph = std::make_unique< + sycl_ex::command_graph>(exec_graph); + } else { + try { + sycl_ctx->exec_graph->update(model_sycl_graph); + GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n"); + } catch (sycl::exception const & e) { + GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what()); + auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}}); + sycl_ctx->exec_graph = std::make_unique< + sycl_ex::command_graph>(exec_graph); + } + } + + sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph)); + } else +#endif + { + ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph); + } return GGML_STATUS_SUCCESS; } @@ -3761,7 +3830,6 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { } int ggml_backend_sycl_get_device_count() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); return ggml_sycl_info().device_count; } @@ -3851,7 +3919,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; } return false; - } break; + } case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_NEG: @@ -3869,7 +3937,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g default: return false; } - break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { @@ -3900,7 +3967,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return false; } return true; - } break; + } case GGML_OP_OUT_PROD: return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; case GGML_OP_GET_ROWS: @@ -3917,7 +3984,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g default: return false; } - } break; + } case GGML_OP_CPY: { ggml_type src0_type = op->src[0]->type; @@ -3968,12 +4035,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; } return false; - } break; + } case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; - } break; + } case GGML_OP_DUP: case GGML_OP_ARGMAX: case GGML_OP_NONE: @@ -3997,6 +4064,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return (op->src[0]->type == GGML_TYPE_F32); case GGML_OP_NORM: case GGML_OP_RMS_NORM: + case GGML_OP_L2_NORM: case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]); case GGML_OP_SCALE: @@ -4030,6 +4098,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_LEAKY_RELU: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: case GGML_OP_GATED_LINEAR_ATTN: return true; default: diff --git a/ggml/src/ggml-sycl/mmq.cpp b/ggml/src/ggml-sycl/mmq.cpp index 8ea82c940..ffb272aa2 100644 --- a/ggml/src/ggml-sycl/mmq.cpp +++ b/ggml/src/ggml-sycl/mmq.cpp @@ -3017,7 +3017,6 @@ void ggml_sycl_op_mul_mat_q( break; default: GGML_ABORT("fatal 
error"); - break; } GGML_UNUSED(src1); diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index a96286d71..1b92ba2d6 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + 
[[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q( vx, vy, dst, ncols, nrows, item_ct1); @@ -734,7 +734,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq2_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -755,7 +755,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq2_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -777,7 +777,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq2_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -799,7 +799,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq3_xxs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -821,7 +821,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq3_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -843,7 +843,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq1_s_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -864,7 +864,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq1_m_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -886,7 +886,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq4_nl_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -908,7 +908,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_iq4_xs_q8_1( vx, vy, dst, ncols, nrows, item_ct1); }); @@ -1003,7 +1003,6 @@ void ggml_sycl_op_mul_mat_vec_q( break; default: GGML_ABORT("fatal error"); - break; } } GGML_UNUSED(src1); diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 9cf2be155..d9678da8f 100644 --- 
a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -180,6 +180,50 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa } } +static void l2_norm_f32(const float* x, float* dst, const int ncols, const float eps, + const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + const int tid = item_ct1.get_local_id(2); + const int nthreads = item_ct1.get_local_range(2); + const int nwarps = nthreads / WARP_SIZE; + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row * ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp, item_ct1); + if (block_size > WARP_SIZE) { + + int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; + int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + /* + DPCT1118:3: SYCL group functions and algorithms must be encountered in + converged control flow. You may need to adjust the code. + */ + item_ct1.barrier(sycl::access::fence_space::local_space); + size_t nreduce = nwarps / WARP_SIZE; + tmp = 0.f; + for (size_t i = 0; i < nreduce; i += 1) + { + tmp += s_sum[lane_id + i * WARP_SIZE]; + } + tmp = warp_reduce_sum(tmp, item_ct1); + } + + const float scale = sycl::rsqrt(sycl::max(tmp, eps * eps)); + + for (int col = tid; col < ncols; col += block_size) { + dst[row * ncols + col] = scale * x[row * ncols + col]; + } +} + static void norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const float eps, queue_ptr stream, int device) { @@ -191,7 +235,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE); }); @@ -214,7 +258,7 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); @@ -233,7 +277,7 @@ static void group_norm_f32_sycl(const float* x, float* dst, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { group_norm_f32( x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr, WARP_SIZE); @@ -260,7 +304,7 @@ static void group_norm_f32_sycl(const float* x, float* dst, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); @@ -281,7 +325,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { rms_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE); 
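Note: per row, the l2_norm_f32 kernel above accumulates the sum of squares with a warp (and, for wide rows, cross-warp) reduction and then rescales, i.e. y_i = x_i / max(||x||_2, eps); rsqrt(max(sum, eps^2)) equals 1 / max(sqrt(sum), eps) because both quantities are non-negative. A scalar reference for one row, as a sketch for validating the kernel rather than backend code:

    #include <algorithm>
    #include <cmath>

    // Scalar reference (sketch) for one row of GGML_OP_L2_NORM:
    //   dst[i] = x[i] / max(||x||_2, eps)
    static void l2_norm_row_ref(const float * x, float * dst, int ncols, float eps) {
        float sum = 0.0f;
        for (int i = 0; i < ncols; ++i) {
            sum += x[i] * x[i];
        }
        const float scale = 1.0f / std::sqrt(std::max(sum, eps * eps));
        for (int i = 0; i < ncols; ++i) {
            dst[i] = scale * x[i];
        }
    }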
}); @@ -303,7 +347,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { rms_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); @@ -311,6 +355,48 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, } } +static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols, + const int nrows, const float eps, + queue_ptr stream, int device) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); + if (ncols < 1024) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + stream->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + l2_norm_f32(x, dst, ncols, eps, item_ct1, + nullptr, WARP_SIZE); + }); + }); + } + else { + const int work_group_size = ggml_sycl_info().max_work_group_sizes[device]; + assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); + const sycl::range<3> block_dims(1, 1, work_group_size); + /* + DPCT1049:19: The work-group size passed to the SYCL kernel may exceed + the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if needed. + */ + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE), + cgh); + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, + block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { + l2_norm_f32(x, dst, ncols, eps, item_ct1, + get_pointer(s_sum_acc_ct1), work_group_size); + }); + }); + } +} + void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, ggml_tensor* dst, const float* src0_dd, const float* src1_dd, float* dst_dd, @@ -376,3 +462,25 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* sr (void)dst; (void)src1_dd; } + +void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst, + const float* src0_dd, const float* src1_dd, + float* dst_dd, + const queue_ptr& main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + l2_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + + (void)src1; + (void)dst; + (void)src1_dd; +} diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp index a9ad9156f..11e91680c 100644 --- a/ggml/src/ggml-sycl/norm.hpp +++ b/ggml/src/ggml-sycl/norm.hpp @@ -32,4 +32,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* float* dst_dd, const queue_ptr& main_stream); +void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, + const ggml_tensor* src1, ggml_tensor* dst, + const float* src0_dd, const float* src1_dd, + float* dst_dd, + const queue_ptr& main_stream); + #endif // GGML_SYCL_NORM_HPP diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index eb20bd251..7563d9ced 100644 --- 
a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -132,7 +132,7 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { soft_max_f32(x, mask, dst, ncols_par, nrows_y, scale, max_bias, m0, m1, n_head_log2, item_ct1, diff --git a/ggml/src/ggml-sycl/wkv.cpp b/ggml/src/ggml-sycl/wkv.cpp new file mode 100644 index 000000000..540f6fbf5 --- /dev/null +++ b/ggml/src/ggml-sycl/wkv.cpp @@ -0,0 +1,305 @@ +#include +#include "wkv.hpp" + +constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE + +// Helper function for the main kernel +template +static void rwkv_wkv6_f32_kernel( + const int B, const int T, const int C, const int H, + const float* k, const float* v, const float* r, + const float* tf, const float* td, const float* s, + float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) { + + const int tid = item_ct1.get_local_id(2); + const int bid = item_ct1.get_group(2); + + const int head_size = block_size; + const int batch_i = bid / H; + const int head_i = bid % H; + const int state_size = C * head_size; + const int n_seq_tokens = T / B; + + // Set up shared memory pointers + float* _k = shared_mem; + float* _r = _k + head_size; + float* _tf = _r + head_size; + float* _td = _tf + head_size; + + // Local state array + float state[block_size]; + + // Load initial state + #pragma unroll + for (int i = 0; i < head_size; i++) { + state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid]; + } + + // Sync threads before shared memory operations + item_ct1.barrier(sycl::access::fence_space::local_space); + + // Load time-mixing parameters + _tf[tid] = tf[head_i * head_size + tid]; + item_ct1.barrier(sycl::access::fence_space::local_space); + + // Main sequence processing loop + for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; + t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; + t += C) { + + item_ct1.barrier(sycl::access::fence_space::local_space); + + // Load current timestep data to shared memory + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + + item_ct1.barrier(sycl::access::fence_space::local_space); + + const float _v = v[t]; + float y = 0; + + // Process in chunks of 4 for better vectorization + sycl::float4 k4, r4, tf4, td4, s4; + #pragma unroll + for (int j = 0; j < head_size; j += 4) { + // Load data in vec4 chunks + k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); + td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]); + s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]); + + // Compute key-value product + sycl::float4 kv4 = k4 * _v; + + // Accumulate weighted sum + y += sycl::dot(r4, tf4 * kv4 + s4); + + // Update state + s4 = s4 * td4 + kv4; + + // Store updated state + state[j] = s4.x(); + state[j+1] = s4.y(); + state[j+2] = s4.z(); + state[j+3] = s4.w(); + } + + dst[t] = y; + } + + // Save final state + #pragma unroll + for (int i = 0; i < head_size; i++) { + dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i]; + } +} + +template +static void rwkv_wkv7_f32_kernel( + const int B, const int T, const int C, const int H, + const float* r, const 
float* w, const float* k, const float* v, + const float* a, const float* b, const float* s, + float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) { + + const int tid = item_ct1.get_local_id(2); + const int bid = item_ct1.get_group(2); + + const int head_size = block_size; + const int batch_i = bid / H; + const int head_i = bid % H; + const int state_size = C * head_size; + const int n_seq_tokens = T / B; + + float* _r = shared_mem; + float* _w = _r + head_size; + float* _k = _w + head_size; + float* _a = _k + head_size; + float* _b = _a + head_size; + + float state[block_size]; + + #pragma unroll + for (int i = 0; i < head_size; i++) { + state[i] = s[batch_i * state_size + head_i * head_size * head_size + tid * head_size + i]; + } + + for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; + t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; + t += C) { + + item_ct1.barrier(sycl::access::fence_space::local_space); + + _r[tid] = r[t]; + _w[tid] = w[t]; + _k[tid] = k[t]; + _a[tid] = a[t]; + _b[tid] = b[t]; + + item_ct1.barrier(sycl::access::fence_space::local_space); + + const float _v = v[t]; + float y = 0, sa = 0; + sycl::float4 a4, s4; + + #pragma unroll + for (int j = 0; j < head_size; j += 4) { + a4 = sycl::float4(_a[j], _a[j+1], _a[j+2], _a[j+3]); + s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]); + sa += sycl::dot(a4, s4); + } + + sycl::float4 r4, w4, k4, b4; + #pragma unroll + for (int j = 0; j < head_size; j += 4) { + r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + w4 = sycl::float4(_w[j], _w[j+1], _w[j+2], _w[j+3]); + k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + b4 = sycl::float4(_b[j], _b[j+1], _b[j+2], _b[j+3]); + s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]); + + sycl::float4 kv4 = k4 * _v; + + s4 = s4 * w4 + kv4 + sa * b4; + y += sycl::dot(r4, s4); + + state[j] = s4.x(); + state[j+1] = s4.y(); + state[j+2] = s4.z(); + state[j+3] = s4.w(); + } + + dst[t] = y; + } + + #pragma unroll + for (int i = 0; i < head_size; i++) { + dst[T * C + batch_i * state_size + head_i * head_size * head_size + tid * head_size + i] = state[i]; + } +} + +void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; + + const float* k_d = (const float*)dst->src[0]->data; + const float* v_d = (const float*)dst->src[1]->data; + const float* r_d = (const float*)dst->src[2]->data; + const float* tf_d = (const float*)dst->src[3]->data; + const float* td_d = (const float*)dst->src[4]->data; + const float* s_d = (const float*)dst->src[5]->data; + float* dst_d = (float*)dst->data; + + const int64_t B = dst->src[5]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == WKV_BLOCK_SIZE || C / H == WKV_BLOCK_SIZE * 2); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64 + + dpct::queue_ptr stream = ctx.stream(); + + // Calculate execution configuration + const size_t shared_mem_size = C / H * 4 * sizeof(float); // For k, r, tf, td + sycl::range<3> block_dims(1, 1, C / H); + sycl::range<3> grid_dims(1, 1, B * H); + + // Submit kernel + if (C / H == WKV_BLOCK_SIZE) { + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + 
[=](sycl::nd_item<3> item_ct1) { + rwkv_wkv6_f32_kernel( + B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, + item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() + ); + }); + }); + } else { + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rwkv_wkv6_f32_kernel( + B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, + item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() + ); + }); + }); + } + + GGML_UNUSED(src0); + GGML_UNUSED(src1); +} + +void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; + + const float* r_d = (const float*)dst->src[0]->data; + const float* w_d = (const float*)dst->src[1]->data; + const float* k_d = (const float*)dst->src[2]->data; + const float* v_d = (const float*)dst->src[3]->data; + const float* a_d = (const float*)dst->src[4]->data; + const float* b_d = (const float*)dst->src[5]->data; + const float* s_d = (const float*)dst->src[6]->data; + float* dst_d = (float*)dst->data; + + const int64_t B = dst->src[6]->ne[1]; + const int64_t T = dst->src[0]->ne[2]; + const int64_t C = dst->ne[0]; + const int64_t H = dst->src[0]->ne[1]; + + GGML_ASSERT(dst->src[6]->type == GGML_TYPE_F32); + GGML_ASSERT(C % H == 0); + GGML_ASSERT(C / H == WKV_BLOCK_SIZE || C / H == WKV_BLOCK_SIZE * 2); + + dpct::queue_ptr stream = ctx.stream(); + + // Calculate execution configuration + const size_t shared_mem_size = C / H * 5 * sizeof(float); // For r, w, k, a, b + sycl::range<3> block_dims(1, 1, C / H); + sycl::range<3> grid_dims(1, 1, B * H); + + // Submit kernel + if (C / H == WKV_BLOCK_SIZE) { + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rwkv_wkv7_f32_kernel( + B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d, + item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() + ); + }); + }); + } else { + stream->submit([&](sycl::handler& cgh) { + sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); + + cgh.parallel_for( + sycl::nd_range<3>(grid_dims * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rwkv_wkv7_f32_kernel( + B, T, C, H, r_d, w_d, k_d, v_d, a_d, b_d, s_d, dst_d, + item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() + ); + }); + }); + } + + GGML_UNUSED(src0); + GGML_UNUSED(src1); +} diff --git a/ggml/src/ggml-sycl/wkv.hpp b/ggml/src/ggml-sycl/wkv.hpp new file mode 100644 index 000000000..9f34a1001 --- /dev/null +++ b/ggml/src/ggml-sycl/wkv.hpp @@ -0,0 +1,10 @@ +#ifndef GGML_SYCL_WKV_HPP +#define GGML_SYCL_WKV_HPP + +#include "common.hpp" + +void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_WKV_HPP diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp deleted file mode 100644 index b54c20964..000000000 --- a/ggml/src/ggml-sycl/wkv6.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include -#include "wkv6.hpp" - -constexpr int WKV_BLOCK_SIZE = 64; // Matching CUDA_WKV_BLOCK_SIZE - -// Helper function for the main kernel -static void rwkv_wkv_f32_kernel( - const int B, const int T, const int C, const int H, - const float* k, const float* v, const float* r, - const float* 
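Note: for each head, rwkv_wkv7_f32_kernel keeps one state row per output channel in registers and applies the RWKV7 recurrence once per timestep; the vectorised float4 loop above is an unrolled form of the scalar update below, and the wkv6 kernel is analogous with the tf/td (time-first / time-decay) terms in place of w, a and b. A scalar sketch of one channel's step:

    // Scalar sketch of one output channel's wkv7 update (state has head_size entries and
    // persists across timesteps); mirrors the float4 loop in rwkv_wkv7_f32_kernel.
    static float wkv7_step_ref(float * state, const float * r, const float * w, const float * k,
                               float v, const float * a, const float * b, int head_size) {
        float sa = 0.0f;
        for (int j = 0; j < head_size; ++j) {
            sa += a[j] * state[j];                    // projection of the old state onto a
        }
        float y = 0.0f;
        for (int j = 0; j < head_size; ++j) {
            const float kv = k[j] * v;                // rank-1 key/value contribution
            state[j] = state[j] * w[j] + kv + sa * b[j];
            y += r[j] * state[j];                     // readout with the receptance r
        }
        return y;                                     // written to dst[t] in the kernel
    }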
tf, const float* td, const float* s, - float* dst, const sycl::nd_item<3>& item_ct1, float* shared_mem) { - - const int tid = item_ct1.get_local_id(2); - const int bid = item_ct1.get_group(2); - - const int head_size = WKV_BLOCK_SIZE; - const int batch_i = bid / H; - const int head_i = bid % H; - const int state_size = C * head_size; - const int n_seq_tokens = T / B; - - // Set up shared memory pointers - float* _k = shared_mem; - float* _r = _k + head_size; - float* _tf = _r + head_size; - float* _td = _tf + head_size; - - // Local state array - float state[WKV_BLOCK_SIZE]; - - // Load initial state - #pragma unroll - for (int i = 0; i < head_size; i++) { - state[i] = s[batch_i * state_size + head_i * head_size * head_size + i * head_size + tid]; - } - - // Sync threads before shared memory operations - item_ct1.barrier(sycl::access::fence_space::local_space); - - // Load time-mixing parameters - _tf[tid] = tf[head_i * head_size + tid]; - item_ct1.barrier(sycl::access::fence_space::local_space); - - // Main sequence processing loop - for (int t = batch_i * n_seq_tokens * C + head_i * head_size + tid; - t < (batch_i + 1) * n_seq_tokens * C + head_i * head_size + tid; - t += C) { - - item_ct1.barrier(sycl::access::fence_space::local_space); - - // Load current timestep data to shared memory - _k[tid] = k[t]; - _r[tid] = r[t]; - _td[tid] = td[t]; - - item_ct1.barrier(sycl::access::fence_space::local_space); - - const float _v = v[t]; - float y = 0; - - // Process in chunks of 4 for better vectorization - sycl::float4 k4, r4, tf4, td4, s4; - #pragma unroll - for (int j = 0; j < head_size; j += 4) { - // Load data in vec4 chunks - k4 = sycl::float4(_k[j], _k[j+1], _k[j+2], _k[j+3]); - r4 = sycl::float4(_r[j], _r[j+1], _r[j+2], _r[j+3]); - tf4 = sycl::float4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); - td4 = sycl::float4(_td[j], _td[j+1], _td[j+2], _td[j+3]); - s4 = sycl::float4(state[j], state[j+1], state[j+2], state[j+3]); - - // Compute key-value product - sycl::float4 kv4 = k4 * _v; - - // Accumulate weighted sum - y += sycl::dot(r4, tf4 * kv4 + s4); - - // Update state - s4 = s4 * td4 + kv4; - - // Store updated state - state[j] = s4.x(); - state[j+1] = s4.y(); - state[j+2] = s4.z(); - state[j+3] = s4.w(); - } - - dst[t] = y; - } - - // Save final state - #pragma unroll - for (int i = 0; i < head_size; i++) { - dst[T * C + batch_i * state_size + head_i * head_size * head_size + i * head_size + tid] = state[i]; - } -} - -void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { - - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - - const float* k_d = (const float*)dst->src[0]->data; - const float* v_d = (const float*)dst->src[1]->data; - const float* r_d = (const float*)dst->src[2]->data; - const float* tf_d = (const float*)dst->src[3]->data; - const float* td_d = (const float*)dst->src[4]->data; - const float* s_d = (const float*)dst->src[5]->data; - float* dst_d = (float*)dst->data; - - const int64_t B = dst->src[5]->ne[1]; - const int64_t T = dst->src[0]->ne[2]; - const int64_t C = dst->ne[0]; - const int64_t H = dst->src[0]->ne[1]; - - GGML_ASSERT(dst->src[5]->type == GGML_TYPE_F32); - GGML_ASSERT(C % H == 0); - GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64 - - dpct::queue_ptr stream = ctx.stream(); - - // Calculate execution configuration - const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td - sycl::range<3> block_dims(1, 1, C / H); - 
sycl::range<3> grid_dims(1, 1, B * H); - - // Submit kernel - stream->submit([&](sycl::handler& cgh) { - sycl::local_accessor shared_mem_acc(shared_mem_size, cgh); - - cgh.parallel_for( - sycl::nd_range<3>(grid_dims * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rwkv_wkv_f32_kernel( - B, T, C, H, k_d, v_d, r_d, tf_d, td_d, s_d, dst_d, - item_ct1, (float*)shared_mem_acc.get_multi_ptr().get() - ); - }); - }); - - GGML_UNUSED(src0); - GGML_UNUSED(src1); -} diff --git a/ggml/src/ggml-sycl/wkv6.hpp b/ggml/src/ggml-sycl/wkv6.hpp deleted file mode 100644 index 8c596a997..000000000 --- a/ggml/src/ggml-sycl/wkv6.hpp +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef GGML_SYCL_WKV6_HPP -#define GGML_SYCL_WKV6_HPP - -#include "common.hpp" - -void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - - -#endif // GGML_SYCL_WKV6_HPP diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ff53bdfbe..dd680aa52 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -29,6 +29,7 @@ #include "ggml-vulkan-shaders.hpp" +#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) #define VK_VENDOR_ID_AMD 0x1002 @@ -149,6 +150,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf); static constexpr uint32_t mul_mat_vec_max_cols = 8; +enum vk_device_architecture { + OTHER, + AMD_GCN, + AMD_RDNA1, + AMD_RDNA2, + AMD_RDNA3, +}; + +static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) { + vk::PhysicalDeviceProperties props = device.getProperties(); + + if (props.vendorID == VK_VENDOR_ID_AMD) { + const std::vector ext_props = device.enumerateDeviceExtensionProperties(); + + bool amd_shader_core_properties = false; + bool integer_dot_product = false; + bool subgroup_size_control = false; + + for (const auto& properties : ext_props) { + if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) { + amd_shader_core_properties = true; + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { + integer_dot_product = true; + } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { + subgroup_size_control = true; + } + } + + if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) { + return vk_device_architecture::OTHER; + } + + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; + vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + + props2.pNext = &shader_core_props_amd; + shader_core_props_amd.pNext = &integer_dot_props; + integer_dot_props.pNext = &subgroup_size_control_props; + + device.getProperties2(&props2); + + if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) { + return vk_device_architecture::AMD_GCN; + } + if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) { + // RDNA + if (shader_core_props_amd.wavefrontsPerSimd == 20) { + return vk_device_architecture::AMD_RDNA1; + } + if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) { + return vk_device_architecture::AMD_RDNA3; + } + return vk_device_architecture::AMD_RDNA2; + } + } + return vk_device_architecture::OTHER; +} + struct vk_device_struct { std::mutex mutex; @@ -161,6 +222,7 @@ 
struct vk_device_struct { bool pipeline_robustness; vk::Device device; uint32_t vendor_id; + vk_device_architecture architecture; vk_queue compute_queue; vk_queue transfer_queue; bool single_queue; @@ -242,6 +304,7 @@ struct vk_device_struct { vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; vk_pipeline pipeline_rms_norm_back_f32; + vk_pipeline pipeline_l2_norm_f32; vk_pipeline pipeline_gelu_f32; vk_pipeline pipeline_gelu_quick_f32; vk_pipeline pipeline_silu_f32; @@ -266,6 +329,7 @@ struct vk_device_struct { vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; + vk_pipeline pipeline_rwkv_wkv7_f32; vk_pipeline pipeline_opt_step_adamw_f32; // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} @@ -368,6 +432,7 @@ struct vk_mat_mat_push_constants { uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; uint32_t k_split; uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; + uint32_t padded_N; }; struct vk_mat_vec_push_constants { uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; @@ -380,6 +445,7 @@ struct vk_mat_mat_id_push_constants { uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11; + uint32_t padded_N; }; struct vk_mat_vec_id_push_constants { uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; @@ -565,6 +631,13 @@ struct vk_op_rwkv_wkv6_push_constants { uint32_t H; }; +struct vk_op_rwkv_wkv7_push_constants { + uint32_t B; + uint32_t T; + uint32_t C; + uint32_t H; +}; + // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} @@ -1445,6 +1518,73 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec return supported; } +struct GpuPipelineConfig { + // GPU architecture identifier. + // Example: vk_device_architecture::AMD_GCN + vk_device_architecture arch; + + // Mapping of pipeline names to their specific subgroup sizes. + // Example: {"soft_max_f32", 64} + std::unordered_map pipelines; + + // Default subgroup size for this GPU. + // Defaults to 0 if not explicitly provided. + uint32_t default_subgroup_size = 0; +}; + +// Pipeline configuration for RDNA1 GPUs. +static const std::unordered_map rdna1_pipelines = { + {"soft_max", 64}, {"im2col", 64}, + {"argmax", 64}, {"mul_mat_vec", 64}, + {"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32} +}; + +// Pipeline configuration for RDNA2 GPUs. +static const std::unordered_map rdna2_pipelines = { + {"soft_max", 64}, {"im2col", 64}, +}; + +static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32; + +// Define configurations for different GPUs. 
+static std::vector gpu_pipeline_configs = { + { + vk_device_architecture::AMD_RDNA1, + { + rdna1_pipelines, + }, + RDNA_DEFAULT_SUBGROUP_SIZE + }, + { + vk_device_architecture::AMD_RDNA2, + { + rdna2_pipelines, + }, + RDNA_DEFAULT_SUBGROUP_SIZE + }, +}; + +static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) { + for (const auto &config : gpu_pipeline_configs) { + if (config.arch == arch) { + auto pipIt = config.pipelines.find(pipeline_name); + if (pipIt != config.pipelines.end()) { + return pipIt->second; + } + std::vector> sorted_pipelines(config.pipelines.begin(), config.pipelines.end()); + std::sort(sorted_pipelines.begin(), sorted_pipelines.end(), + [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); }); + for (const auto &entry : sorted_pipelines) { + if (pipeline_name.find(entry.first) != std::string::npos) { + return entry.second; + } + } + return config.default_subgroup_size; + } + } + return 0; // If no matching configuration is found +} + static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); @@ -1466,36 +1606,36 @@ static void ggml_vk_load_shaders(vk_device& device) { uint32_t l_align, m_align, s_align; if (device->coopmat2) { // spec constants and tile sizes for non-quant matmul/matmul_id - l_warptile = { 256, 128, 256, 64 }; - m_warptile = { 256, 128, 128, 64 }; - s_warptile = { 128, 64, 64, 64 }; + l_warptile = { 256, 128, 256, 64, 1 }; + m_warptile = { 256, 128, 128, 64, 0 }; + s_warptile = { 128, 64, 64, 64, 0 }; l_wg_denoms = {128, 256, 1 }; m_wg_denoms = {128, 128, 1 }; s_wg_denoms = { 64, 64, 1 }; // spec constants and tile sizes for quant matmul (non-Qi_K) - l_warptile_mmq = { 256, 128, 256, 64 }; - m_warptile_mmq = { 256, 128, 128, 64 }; - s_warptile_mmq = { 256, 128, 128, 64 }; + l_warptile_mmq = { 256, 128, 256, 64, 1 }; + m_warptile_mmq = { 256, 128, 128, 64, 1 }; + s_warptile_mmq = { 256, 32, 64, 128, 0 }; l_mmq_wg_denoms = { 128, 256, 1 }; m_mmq_wg_denoms = { 128, 128, 1 }; - s_mmq_wg_denoms = { 128, 128, 1 }; + s_mmq_wg_denoms = { 32, 64, 1 }; // spec constants and tile sizes for quant matmul (Qi_K) - l_warptile_mmq_k = { 256, 128, 512, 16 }; - m_warptile_mmq_k = { 256, 128, 256, 16 }; - s_warptile_mmq_k = { 256, 32, 128, 64 }; - l_mmq_wg_denoms_k = { 128, 512, 1 }; - m_mmq_wg_denoms_k = { 128, 256, 1 }; - s_mmq_wg_denoms_k = { 32, 128, 1 }; + l_warptile_mmq_k = { 256, 64, 128, 64, 1 }; + m_warptile_mmq_k = { 256, 32, 64, 64, 0 }; + s_warptile_mmq_k = { 256, 32, 32, 128, 0 }; + l_mmq_wg_denoms_k = { 64, 128, 1 }; + m_mmq_wg_denoms_k = { 32, 64, 1 }; + s_mmq_wg_denoms_k = { 32, 32, 1 }; // spec constants and tile sizes for quant matmul_id - l_warptile_mmqid = { 256, 128, 128, 16 }; - m_warptile_mmqid = { 256, 128, 64, 16 }; - s_warptile_mmqid = { 256, 64, 64, 16 }; - l_mmqid_wg_denoms = { 128, 128, 1 }; + l_warptile_mmqid = { 256, 128, 64, 16, 0 }; + m_warptile_mmqid = { 256, 128, 64, 16, 0 }; + s_warptile_mmqid = { 256, 128, 64, 16, 0 }; + l_mmqid_wg_denoms = { 128, 64, 1 }; m_mmqid_wg_denoms = { 128, 64, 1 }; - s_mmqid_wg_denoms = { 64, 64, 1 }; + s_mmqid_wg_denoms = { 128, 64, 1 }; l_align = 128; m_align = 64; @@ -1571,6 +1711,10 @@ static void ggml_vk_load_shaders(vk_device& device) { uint32_t parameter_count, uint32_t push_constant_size, std::array wg_denoms, const std::vector& specialization_constants, uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) { 
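Note: get_subgroup_size above resolves a per-pipeline override in three steps: exact key, then longest substring key, then the architecture's default; architectures without a config entry return 0, which leaves the subgroup size unset. With the RDNA1 table above (the pipeline names passed in are illustrative, and the definitions from the hunk above are assumed to be in scope):

    uint32_t a = get_subgroup_size("soft_max",            vk_device_architecture::AMD_RDNA1); // 64, exact key
    uint32_t b = get_subgroup_size("mul_mat_vec_f16_f32", vk_device_architecture::AMD_RDNA1); // 32, longest
                                                                                              // substring key is
                                                                                              // "mul_mat_vec_f16"
    uint32_t c = get_subgroup_size("rope_f32",            vk_device_architecture::AMD_RDNA1); // 32, RDNA default
    uint32_t d = get_subgroup_size("soft_max",            vk_device_architecture::OTHER);     // 0, no override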
+ if (!require_full_subgroups && required_subgroup_size == 0) { + required_subgroup_size = get_subgroup_size(name, device->architecture); + } + if (!pipeline) { pipeline = std::make_shared(); pipeline->name = name; @@ -2128,6 +2272,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -2239,6 +2384,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); + ggml_vk_create_pipeline(device, device->pipeline_opt_step_adamw_f32, "opt_step_adamw_f32", opt_step_adamw_f32_len, opt_step_adamw_f32_data, "main", 5, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); for (auto &c : compiles) { @@ -2247,7 +2394,7 @@ static void ggml_vk_load_shaders(vk_device& device) { device->need_compiles = false; } -static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props); +static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch); static vk_device ggml_vk_get_device(size_t idx) { VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")"); @@ -2276,6 +2423,8 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device = physical_devices[dev_num]; const std::vector ext_props = device->physical_device.enumerateDeviceExtensionProperties(); + device->architecture = get_device_architecture(device->physical_device); + const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY"); device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr; @@ -2288,7 +2437,6 @@ static vk_device ggml_vk_get_device(size_t idx) { bool coopmat2_support = false; device->coopmat_support = false; - // Check if maintenance4 is supported for (const auto& properties : ext_props) { if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { maintenance4_support = true; @@ -2376,13 +2524,9 @@ static vk_device ggml_vk_get_device(size_t idx) { if (GGML_VK_SUBALLOCATION_BLOCK_SIZE != nullptr) { 
device->suballocation_block_size = std::stoul(GGML_VK_SUBALLOCATION_BLOCK_SIZE); -#if defined(_WIN32) - } else if (device->vendor_id == VK_VENDOR_ID_NVIDIA) { + } else { // Limit batching of allocations to 1GB by default to avoid fragmentation issues device->suballocation_block_size = 1024*1024*1024; -#endif - } else { - device->suballocation_block_size = device->max_memory_allocation_size; } device->suballocation_block_size = std::min(device->suballocation_block_size, device->max_memory_allocation_size); @@ -2401,7 +2545,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) { + if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) { device->coopmat_support = false; } @@ -2779,7 +2923,10 @@ static void ggml_vk_print_gpu_info(size_t idx) { subgroup_props.pNext = &driver_props; physical_device.getProperties2(&props2); - const size_t subgroup_size = subgroup_props.subgroupSize; + vk_device_architecture arch = get_device_architecture(physical_device); + uint32_t default_subgroup_size = get_subgroup_size("", arch); + const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize; + const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; bool fp16_storage = false; @@ -2805,7 +2952,9 @@ static void ggml_vk_print_gpu_info(size_t idx) { } } - if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) { + const vk_device_architecture device_architecture = get_device_architecture(physical_device); + + if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) { coopmat_support = false; } @@ -3850,10 +3999,14 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")"); if (ctx->device->coopmat2) { - if ((ctx->device->mul_mat_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) { + // Use large shader when the N dimension is greater than the medium shader's tile size + uint32_t crossover_large = mmp->m->wg_denoms[1]; + if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) { return aligned ? mmp->a_l : mmp->l; } - if ((ctx->device->mul_mat_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_s[src0_type]) { + // Use medium shader when the N dimension is greater than the small shader's tile size + uint32_t crossover_medium = mmp->s->wg_denoms[1]; + if ((ctx->device->mul_mat_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_s[src0_type]) { return aligned ? mmp->a_m : mmp->m; } return aligned ? 
mmp->a_s : mmp->s; @@ -3878,18 +4031,19 @@ static void ggml_vk_matmul( vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, - uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3) { + uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, + uint32_t padded_n) { VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ")"); ggml_vk_sync_buffers(subctx); if (split_k == 1) { - const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3 }; + const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch }); return; } GGML_ASSERT(batch_stride_d == m * n); - const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3 }; + const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n }; // Make sure enough workgroups get assigned for split k to work ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); ggml_vk_sync_buffers(subctx); @@ -3898,13 +4052,17 @@ static void ggml_vk_matmul( } static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned, ggml_type src0_type) { - VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")"); + VK_LOG_DEBUG("ggml_vk_guess_matmul_id_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ")"); if (ctx->device->coopmat2) { - if ((ctx->device->mul_mat_id_l[src0_type] && (m % mmp->l->wg_denoms[0]) == 0 && (n % mmp->l->wg_denoms[1]) == 0) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) { + // Use large shader when the N dimension is greater than the medium shader's tile size + uint32_t crossover_large = mmp->m->wg_denoms[1]; + if ((ctx->device->mul_mat_id_l[src0_type] 
&& (n > crossover_large)) || (!ctx->device->mul_mat_id_m[src0_type] && !ctx->device->mul_mat_id_s[src0_type])) { return aligned ? mmp->a_l : mmp->l; } - if ((ctx->device->mul_mat_id_m[src0_type] && (m % mmp->m->wg_denoms[0]) == 0 && (n % mmp->m->wg_denoms[1]) == 0) || !ctx->device->mul_mat_id_s[src0_type]) { + // Use medium shader when the N dimension is greater than the small shader's tile size + uint32_t crossover_medium = mmp->s->wg_denoms[1]; + if ((ctx->device->mul_mat_id_m[src0_type] && (n > crossover_medium)) || !ctx->device->mul_mat_id_s[src0_type]) { return aligned ? mmp->a_m : mmp->m; } return aligned ? mmp->a_s : mmp->s; @@ -3929,14 +4087,15 @@ static void ggml_vk_matmul_id( vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& ids, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, - uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11) { + uint32_t n_as, uint32_t nei0, uint32_t nei1, uint32_t nbi1, uint32_t ne11, + uint32_t padded_n) { VK_LOG_DEBUG("ggml_vk_matmul_id(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), ids: (" << ids.buffer->buffer << ", " << ids.offset << ", " << ids.size << "), " << "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " << "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " << "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"); ggml_vk_sync_buffers(subctx); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, - nei0, nei1, nbi1, ne11 }; + nei0, nei1, nbi1, ne11, padded_n }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, sizeof(vk_mat_mat_id_push_constants), &pc, { m, nei1, n_as }); } @@ -4098,15 +4257,17 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11, qx_needs_dequant ? GGML_TYPE_F16 : src0->type)); const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type); + // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking + uint32_t padded_n = qy_needs_dequant ? 
ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11; + const int x_ne = ne01 * ne00; + const int y_ne = padded_n * ne10; + const int d_ne = ne11 * ne01; + const uint32_t split_k = ggml_vk_guess_split_k(ctx, ne01, ne11, ne10, pipeline); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); @@ -4229,7 +4390,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21, - split_k, ne12*ne13, ne02, ne12, r2, r3 + split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n ); // NOLINT } @@ -4680,15 +4841,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& // Not implemented GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT - const uint64_t x_ne = ne01 * ne00; - const uint64_t y_ne = ne11 * ne10; - const uint64_t d_ne = ne21 * ne20; - const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_id_pipeline_align(ctx, mmp, ne01, nei1, qx_needs_dequant ? GGML_TYPE_F16 : src0->type)); const bool aligned = ne10 == kpad && ne01 > 8 && nei1 > 8; vk_pipeline pipeline = ggml_vk_guess_matmul_id_pipeline(ctx, mmp, ne01, nei1, aligned, qx_needs_dequant ? GGML_TYPE_F16 : src0->type); + // Reserve extra storage in the N dimension for the Y matrix, so we can avoid bounds-checking + uint32_t padded_n = qy_needs_dequant ? ROUNDUP_POW2(ne11, pipeline->wg_denoms[1]) :ne11; + const uint64_t x_ne = ne01 * ne00; + const uint64_t y_ne = padded_n * ne10; + const uint64_t d_ne = ne21 * ne20; + const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t x_sz = !qx_needs_dequant ? 
qx_sz : sizeof(ggml_fp16_t) * x_ne; @@ -4807,7 +4970,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& { d_D, d_buf_offset, d_sz * ne22 * ne23 }, { d_ids, ids_buf_offset, ids_sz }, ne01, ne21, ne10, ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21, - n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11 + n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n ); // NOLINT } @@ -5318,6 +5481,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_rms_norm_back_f32; } return nullptr; + case GGML_OP_L2_NORM: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_l2_norm_f32; + } + return nullptr; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_SILU: @@ -5457,6 +5625,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_rwkv_wkv6_f32; } return nullptr; + case GGML_OP_RWKV_WKV7: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rwkv_wkv7_f32; + } + return nullptr; case GGML_OP_OPT_STEP_ADAMW: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_opt_step_adamw_f32; @@ -5704,6 +5877,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: + case GGML_OP_L2_NORM: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_SUM_ROWS: @@ -5953,23 +6127,17 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const }, dryrun); } -static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) { - const ggml_tensor * k = dst->src[0]; - const ggml_tensor * v = dst->src[1]; - const ggml_tensor * r = dst->src[2]; - const ggml_tensor * tf = dst->src[3]; - const ggml_tensor * td = dst->src[4]; - const ggml_tensor * state = dst->src[5]; +static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, int version, bool dryrun = false) { + GGML_ASSERT(version == 6 || version == 7); + int num_srcs = version == 6 ? 
6 : 7; + + for (int i = 0; i < num_srcs; i++) { + GGML_ASSERT(!ggml_is_quantized(dst->src[i]->type)); + } - GGML_ASSERT(!ggml_is_quantized(k->type)); - GGML_ASSERT(!ggml_is_quantized(v->type)); - GGML_ASSERT(!ggml_is_quantized(r->type)); - GGML_ASSERT(!ggml_is_quantized(tf->type)); - GGML_ASSERT(!ggml_is_quantized(td->type)); - GGML_ASSERT(!ggml_is_quantized(state->type)); GGML_ASSERT(dst->buffer != nullptr); - vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6); + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, dst->src[0], dst->src[1], dst->src[2], dst, dst->op); GGML_ASSERT(pipeline != nullptr); if (dryrun) { @@ -5978,89 +6146,73 @@ static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subc } ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context; - ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context; - ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context; - ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context; - ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context; - ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context; + ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }; + for (int i = 0; i < num_srcs; i++) { + src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context; + } ggml_vk_sync_buffers(subctx); - vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr; - size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0; - bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false; + vk_buffer d_D = nullptr, d_srcs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }; + size_t dst_offset = 0, src_offsets[7] = { 0, 0, 0, 0, 0, 0, 0 }; + bool dst_uma = false, srcs_uma[7] = { false, false, false, false, false, false, false }; if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, k->data, d_K, k_offset); - ggml_vk_host_get(ctx->device, v->data, d_V, v_offset); - ggml_vk_host_get(ctx->device, r->data, d_R, r_offset); - ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset); - ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset); - ggml_vk_host_get(ctx->device, state->data, d_State, state_offset); + for (int i = 0; i < num_srcs; i++) { + ggml_vk_host_get(ctx->device, dst->src[i]->data, d_srcs[i], src_offsets[i]); + srcs_uma[i] = d_srcs[i] != nullptr; + } + ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset); - - K_uma = d_K != nullptr; - V_uma = d_V != nullptr; - R_uma = d_R != nullptr; - TF_uma = d_TF != nullptr; - TD_uma = d_TD != nullptr; - STATE_uma = d_State != nullptr; - DST_uma = d_D != nullptr; + dst_uma = d_D != nullptr; } - if (!K_uma) { - d_K = k_buf_ctx->dev_buffer; - k_offset = vk_tensor_offset(k) + k->view_offs; + uint64_t src_sizes[7] = { 0, 0, 0, 0, 0, 0, 0 }; + for (int i = 0; i < num_srcs; i++) { + src_sizes[i] = ggml_nbytes(dst->src[i]); + if (!srcs_uma[i]) { + d_srcs[i] = src_buf_ctxs[i]->dev_buffer; + src_offsets[i] = 
vk_tensor_offset(dst->src[i]) + dst->src[i]->view_offs; + } } - if (!V_uma) { - d_V = v_buf_ctx->dev_buffer; - v_offset = vk_tensor_offset(v) + v->view_offs; - } - if (!R_uma) { - d_R = r_buf_ctx->dev_buffer; - r_offset = vk_tensor_offset(r) + r->view_offs; - } - if (!TF_uma) { - d_TF = tf_buf_ctx->dev_buffer; - tf_offset = vk_tensor_offset(tf) + tf->view_offs; - } - if (!TD_uma) { - d_TD = td_buf_ctx->dev_buffer; - td_offset = vk_tensor_offset(td) + td->view_offs; - } - if (!STATE_uma) { - d_State = state_buf_ctx->dev_buffer; - state_offset = vk_tensor_offset(state) + state->view_offs; - } - if (!DST_uma) { + + const uint64_t dst_size = ggml_nbytes(dst); + if (!dst_uma) { d_D = dst_buf_ctx->dev_buffer; dst_offset = vk_tensor_offset(dst) + dst->view_offs; } - const uint64_t k_size = ggml_nbytes(k); - const uint64_t v_size = ggml_nbytes(v); - const uint64_t r_size = ggml_nbytes(r); - const uint64_t tf_size = ggml_nbytes(tf); - const uint64_t td_size = ggml_nbytes(td); - const uint64_t state_size = ggml_nbytes(state); - const uint64_t dst_size = ggml_nbytes(dst); - std::array elements = { (uint32_t)(pc.B * pc.H), 1, 1 }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { - vk_subbuffer{ d_K, k_offset, k_size }, - vk_subbuffer{ d_V, v_offset, v_size }, - vk_subbuffer{ d_R, r_offset, r_size }, - vk_subbuffer{ d_TF, tf_offset, tf_size }, - vk_subbuffer{ d_TD, td_offset, td_size }, - vk_subbuffer{ d_State, state_offset, state_size }, - vk_subbuffer{ d_D, dst_offset, dst_size } - }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements); + if (version == 6) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { + vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, + vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] }, + vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] }, + vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] }, + vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, + vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, + vk_subbuffer{ d_D, dst_offset, dst_size } + }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements); + } else if (version == 7) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { + vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, + vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] }, + vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] }, + vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] }, + vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, + vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, + vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] }, + vk_subbuffer{ d_D, dst_offset, dst_size } + }, sizeof(vk_op_rwkv_wkv7_push_constants), &pc, elements); + } else { + // shouldn't happen + GGML_ASSERT(false); + } } static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { @@ -6069,7 +6221,7 @@ static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, const size_t n_heads = dst->src[0]->ne[1]; const size_t n_seqs = dst->src[5]->ne[1]; - ggml_vk_op_f32_rwkv6( + ggml_vk_op_f32_wkv( ctx, subctx, dst, { (uint32_t)n_seqs, @@ -6077,6 +6229,26 @@ static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, (uint32_t)n_embed, (uint32_t)n_heads, }, + 6, + dryrun + ); +} + +static void ggml_vk_rwkv_wkv7(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { + const size_t seq_length = dst->src[0]->ne[2]; + const size_t n_embed = dst->ne[0]; + const size_t n_heads = 
dst->src[0]->ne[1]; + const size_t n_seqs = dst->src[6]->ne[1]; + + ggml_vk_op_f32_wkv( + ctx, subctx, dst, + { + (uint32_t)n_seqs, + (uint32_t)seq_length, + (uint32_t)n_embed, + (uint32_t)n_heads, + }, + 7, dryrun ); } @@ -6378,6 +6550,11 @@ static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& sub ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM_BACK, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); } +static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + float * op_params = (float *)dst->op_params; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_L2_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); +} + static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); } @@ -6767,7 +6944,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, - split_k, batch, batch, batch, 1, 1 + split_k, batch, batch, batch, 1, 1, n ); } ggml_vk_ctx_end(subctx); @@ -7112,7 +7289,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, k*m, k*n, m*n, - split_k, batch, batch, batch, 1, 1 + split_k, batch, batch, batch, 1, 1, n ); } ggml_vk_ctx_end(subctx); @@ -7373,6 +7550,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: + case GGML_OP_L2_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: @@ -7389,6 +7567,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: case GGML_OP_LEAKY_RELU: case GGML_OP_FLASH_ATTN_EXT: case GGML_OP_OPT_STEP_ADAMW: @@ -7435,6 +7614,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: + case GGML_OP_L2_NORM: case GGML_OP_UNARY: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -7552,6 +7732,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_RMS_NORM_BACK: ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun); + break; + case GGML_OP_L2_NORM: + ggml_vk_l2_norm(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { @@ -7642,6 +7826,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod break; + case GGML_OP_RWKV_WKV7: + ggml_vk_rwkv_wkv7(ctx, compute_ctx, node, dryrun); + + break; + case GGML_OP_OPT_STEP_ADAMW: ggml_vk_opt_step_adamw(ctx, compute_ctx, node, dryrun); @@ -7715,6 +7904,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: + case 
GGML_OP_L2_NORM: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: @@ -7734,6 +7924,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: case GGML_OP_LEAKY_RELU: case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: @@ -8651,6 +8842,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_NORM: case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: + case GGML_OP_L2_NORM: return ggml_is_contiguous(op->src[0]); case GGML_OP_ADD: case GGML_OP_SUB: @@ -8680,6 +8872,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: case GGML_OP_RWKV_WKV6: + case GGML_OP_RWKV_WKV7: case GGML_OP_LEAKY_RELU: case GGML_OP_OPT_STEP_ADAMW: return true; @@ -8826,7 +9019,7 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve UNUSED(instance_extensions); } -static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) { +static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: // Intel drivers don't support coopmat properly yet @@ -8834,10 +9027,7 @@ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDevicePrope case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs - const std::string name = props.deviceName; - return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs - name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs - name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs + return arch == vk_device_architecture::AMD_RDNA3; } return true; default: @@ -9067,6 +9257,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps); } else if (tensor->op == GGML_OP_SILU_BACK) { tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_L2_NORM) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps); } else if (tensor->op == GGML_OP_SOFT_MAX) { if (src1 != nullptr) { tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); @@ -9186,6 +9379,9 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_RWKV_WKV6) { tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], src_clone[4], src_clone[5]); + } else if (tensor->op == GGML_OP_RWKV_WKV7) { + tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], + src_clone[4], src_clone[5], src_clone[6]); } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) { src_clone[0]->flags = src0->flags; tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1], diff --git 
a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
index 074031087..51c78b7d2 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt
@@ -1,8 +1,4 @@
 find_package (Threads REQUIRED)
-find_program(GLSLC_EXECUTABLE glslc)
-if(NOT GLSLC_EXECUTABLE)
-    message(FATAL_ERROR "glslc not found.")
-endif()
 
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
index 4ccbe613a..8efe4653f 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp
@@ -178,7 +178,7 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
 
     uvec4 v = bl128.block.q4k[0];
 
-    const f16vec2 loadd = unpackFloat2x16(v.x);
+    const vec2 loadd = vec2(unpackFloat2x16(v.x));
 
     uint32_t sc;
     uint32_t mbyte;
@@ -199,15 +199,15 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2
     sc &= 0x3F;
     mbyte &= 0x3F;
 
-    const float16_t d = loadd.x * float16_t(sc);
-    const float16_t m = loadd.y * float16_t(mbyte);
+    const float d = loadd.x * float(sc);
+    const float m = loadd.y * float(mbyte);
 
     uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
     qs = (qs >> (b * 4 + 8 * (idx & 1))) & 0xF;
-    float16_t ret = d * float16_t(qs) - m;
+    float ret = d * float(qs) - m;
 
-    return ret;
+    return float16_t(ret);
 }
 
 layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp
new file mode 100644
index 000000000..deba8c398
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp
@@ -0,0 +1,41 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+#define BLOCK_SIZE 512
+
+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
+
+shared FLOAT_TYPE sum[BLOCK_SIZE];
+
+void main() {
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
+    const uint tid = gl_LocalInvocationID.x;
+
+    sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
+
+    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
+        const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]);
+        sum[tid] += xi * xi;
+    }
+
+    // sum up partial sums and write back result
+    barrier();
+    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sum[tid] += sum[tid + s];
+        }
+        barrier();
+    }
+
+    const FLOAT_TYPE scale = inversesqrt(max(sum[0], FLOAT_TYPE(p.param1)));
+
+    [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) {
+        data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col]));
+    }
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
index 66dd2c860..7649febb0 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -23,6 +23,10 @@ layout (constant_id = 1) const uint BM = 64;
 layout (constant_id = 2) const uint BN = 64;
 layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant
+layout (constant_id = 4) const bool enable_smaller_matrices = false;
+const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN;
+const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN;
+
 layout (push_constant) uniform parameter
 {
     uint M;
@@ -48,6 +52,8 @@ layout (push_constant) uniform parameter
     uint broadcast2;
     uint broadcast3;
 #endif
+    // N dimension for the B matrix can be >= p.N
+    uint padded_N;
 } p;
 
@@ -166,15 +172,13 @@ void main() {
     const uint end_k = min(p.K, (ik + 1) * p.k_split);
 #endif
 
-    coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
-    sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
-
 #ifdef MUL_MAT_ID
     uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K;
     uint pos_b = 0;
 #else
     uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K;
     uint pos_b = batch_idx * p.batch_stride_b;
+    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
 #endif
 
     uint stride_a = p.stride_a / QUANT_K;
@@ -195,6 +199,7 @@ void main() {
     tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
     tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
     tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+    tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
 
 #if QUANT_K > 1
     tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
@@ -202,18 +207,19 @@ void main() {
 #endif
 
     // Use end_k rather than p.K as the dimension because that's what
-    // we need to bound check against when using split_k
+    // we need to bound check against when using split_k.
+    // Bounds check B against padded_N, but bounds check D against N.
     tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k);
-    tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.N, end_k);
+    tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.padded_N, end_k);
     tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M);
     tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
-    tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.N, end_k);
+    tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.padded_N, end_k);
 
     tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);
 
 #if !defined(MUL_MAT_ID)
     // Detect a fast path where all loads are entirely in bounds and no clamping is required
-    if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
+    if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.padded_N && (start_k % BK) == 0 && (end_k % BK) == 0 &&
 #if QUANT_K == 1
         (stride_a % 8) == 0 &&
 #endif
@@ -229,16 +235,54 @@ void main() {
         tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
 
         uint k_iters = (end_k - start_k + BK - 1) / BK;
+        if (enable_smaller_matrices && ic * BN + BNover4 >= p.N) {
+            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator>(0.0);
+            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
-        for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
+                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
+                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;
-            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
-            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose);
-            coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
-            coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            }
+            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator>(sum);
-            sum = coopMatMulAdd(mat_a, mat_b, sum);
+            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BNover4, ir * BM, BM), tensorViewTranspose);
+            return;
+        } else if (enable_smaller_matrices && ic * BN + BNover2 >= p.N) {
+            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator>(0.0);
+            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
+
+                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
+                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;
+
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose);
+
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            }
+            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator>(sum);
+
+            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BNover2, ir * BM, BM), tensorViewTranspose);
+            return;
+        } else {
+            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
+            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
+
+                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
+                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
+
+                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
+                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
+
+                sum = coopMatMulAdd(mat_a, mat_b, sum);
+            }
+            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
+
+            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
+            return;
+        }
     } else
 #endif // !defined(MUL_MAT_ID)
@@ -251,6 +295,9 @@ void main() {
         tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1);
 
+        coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
+        sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
+
         [[dont_unroll]]
         for (uint block_k = start_k; block_k < end_k; block_k += BK) {
 
@@ -263,7 +310,7 @@ void main() {
 #ifdef MUL_MAT_ID
             bool unclampedB = true;
 #else
-            bool unclampedB = (ic + 1) * BN <= p.N && block_k + BK <= end_k && (block_k % 8) == 0;
+            bool unclampedB = (ic + 1) * BN <= p.padded_N && block_k + BK <= end_k && (block_k % 8) == 0;
 #endif
             if (unclampedA && unclampedB) {
                 coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
@@ -293,19 +340,16 @@ void main() {
                 sum = coopMatMulAdd(mat_a, mat_b, sum);
             }
         }
-    }
 
-    // Convert from ACC_TYPE to D_TYPE
-    coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
-    mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
+        // Convert from ACC_TYPE to D_TYPE
+        coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
+        mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);
 #ifdef MUL_MAT_ID
-    // Call callback to store each element, remapping row through shared memory
-    coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
+        // Call callback to store each element, remapping row through shared memory
+        coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
 #else
-    tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);
-
-    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
-    coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
+        coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
 #endif
+    }
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index ee1fec4e1..eb2ad63ff 100644
---
a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -434,6 +434,7 @@ void process_shaders() { string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); @@ -528,6 +529,8 @@ void process_shaders() { string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); + string_to_spv("rwkv_wkv7_f32", "wkv7.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); + string_to_spv("opt_step_adamw_f32", "opt_step_adamw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); for (auto &c : compiles) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp b/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp new file mode 100644 index 000000000..88c1c02b3 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/wkv7.comp @@ -0,0 +1,91 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : require + +#define BLOCK_SIZE 64 +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform Parameters { + uint B; + uint T; + uint C; + uint H; +}; + +layout(binding = 0) readonly buffer RBuf { A_TYPE r[]; }; +layout(binding = 1) readonly buffer WBuf { A_TYPE w[]; }; +layout(binding = 2) readonly buffer KBuf { A_TYPE k[]; }; +layout(binding = 3) readonly buffer VBuf { A_TYPE v[]; }; +layout(binding = 4) readonly buffer ABuf { A_TYPE a[]; }; +layout(binding = 5) readonly buffer BBuf { A_TYPE b[]; }; +layout(binding = 6) readonly buffer StateBuf { A_TYPE state_in[]; }; +layout(binding = 7) buffer DstBuf { A_TYPE dst[]; }; + +shared A_TYPE _r[BLOCK_SIZE], _w[BLOCK_SIZE], _k[BLOCK_SIZE], _a[BLOCK_SIZE], _b[BLOCK_SIZE]; + +void main() { + const uint head_size = BLOCK_SIZE; + const uint batch_id = gl_WorkGroupID.x / H; + const uint head_id = gl_WorkGroupID.x % H; + const uint tid = gl_LocalInvocationID.x; + + const uint state_size = C * head_size; + const uint n_seq_tokens = T / B; + + if (batch_id >= B || head_id >= H) { + return; + } + + A_TYPE state[BLOCK_SIZE]; + [[unroll]] for (uint i = 0; i < head_size; i++) { + state[i] = state_in[batch_id * state_size + head_id * head_size * head_size + + tid * head_size + i]; + } + + const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; + const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; + + for (uint t = start_t; t < end_t; t += C) { + barrier(); + _r[tid] = r[t]; + _w[tid] = w[t]; + _k[tid] = k[t]; + _a[tid] = a[t]; + _b[tid] = b[t]; + barrier(); + + A_TYPE sa = 0.0; + [[unroll]] for (uint j = 0; j < head_size; j += 4) { + vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]); + vec4 a_vec = vec4(_a[j], _a[j+1], _a[j+2], _a[j+3]); + sa += dot(s_vec, a_vec); + } + + const A_TYPE v_val = v[t]; + A_TYPE y = 0.0; + + [[unroll]] for (uint j = 0; j < head_size; j += 4) { + vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + vec4 w_vec = vec4(_w[j], _w[j+1], 
_w[j+2], _w[j+3]); + vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + vec4 b_vec = vec4(_b[j], _b[j+1], _b[j+2], _b[j+3]); + vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]); + + vec4 kv = k_vec * v_val; + s_vec = s_vec * w_vec + kv + sa * b_vec; + y += dot(r_vec, s_vec); + + state[j] = s_vec.x; + state[j+1] = s_vec.y; + state[j+2] = s_vec.z; + state[j+3] = s_vec.w; + } + + dst[t] = y; + } + + [[unroll]] for (uint i = 0; i < head_size; i++) { + dst[T * C + batch_id * state_size + head_id * head_size * head_size + + tid * head_size + i] = state[i]; + } +} diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 89409bb0e..2e081d591 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -929,6 +929,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "RMS_NORM", "RMS_NORM_BACK", "GROUP_NORM", + "L2_NORM", "MUL_MAT", "MUL_MAT_ID", @@ -977,6 +978,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ADD_REL_POS", "RWKV_WKV6", "GATED_LINEAR_ATTN", + "RWKV_WKV7", "UNARY", @@ -996,7 +998,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1026,6 +1028,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rms_norm(x)", "rms_norm_back(x)", "group_norm(x)", + "l2_norm(x)", "X*Y", "X[i]*Y", @@ -1074,6 +1077,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "add_rel_pos(x)", "rwkv_wkv6(k, v, r, tf, td, s)", "gated_linear_attn(k, v, q, gate, s)", + "rwkv_wkv7(r, w, k, v, a, b, s)", "unary(x)", @@ -1093,7 +1097,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 83, "GGML_OP_COUNT != 83"); +static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -2686,6 +2690,37 @@ struct ggml_tensor * ggml_group_norm_inplace( return ggml_group_norm_impl(ctx, a, n_groups, eps, true); } +// ggml_l2_norm + +static struct ggml_tensor * ggml_l2_norm_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps, + bool inplace) { + struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + ggml_set_op_params_f32(result, 0, eps); + + result->op = GGML_OP_L2_NORM; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_l2_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_l2_norm_impl(ctx, a, eps, false); +} + +struct ggml_tensor * ggml_l2_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps) { + return ggml_l2_norm_impl(ctx, a, eps, true); +} + // ggml_mul_mat static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { @@ -4720,6 +4755,54 @@ struct ggml_tensor * ggml_gated_linear_attn( return result; } +// ggml_rwkv_wkv7 + +struct ggml_tensor * ggml_rwkv_wkv7( + struct ggml_context * ctx, + struct ggml_tensor * r, + struct ggml_tensor * w, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * state) { + GGML_ASSERT(ggml_is_contiguous(r)); + GGML_ASSERT(ggml_is_contiguous(w)); + GGML_ASSERT(ggml_is_contiguous(k)); + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(a)); + GGML_ASSERT(ggml_is_contiguous(b)); + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S = k->ne[0]; + const int64_t H = k->ne[1]; + const int64_t n_tokens = k->ne[2]; + const int64_t n_seqs = state->ne[1]; + { + GGML_ASSERT(w->ne[0] == S && w->ne[1] == H && w->ne[2] == n_tokens); + GGML_ASSERT(k->ne[0] == S && k->ne[1] == H && k->ne[2] == n_tokens); + GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens); + GGML_ASSERT(a->ne[0] == S && a->ne[1] == H && a->ne[2] == n_tokens); + GGML_ASSERT(b->ne[0] == S && b->ne[1] == H && b->ne[2] == n_tokens); + GGML_ASSERT(ggml_nelements(state) == S * S * H * n_seqs); + } + + // concat output and new_state + const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_RWKV_WKV7; + result->src[0] = r; + result->src[1] = w; + result->src[2] = k; + result->src[3] = v; + result->src[4] = a; + result->src[5] = b; + result->src[6] = state; + + return result; +} + // ggml_unary static struct ggml_tensor * ggml_unary_impl( diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 19624eae0..cc48913d9 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -118,22 +118,26 @@ class Keys: TOKEN_SHIFT_COUNT = "{arch}.token_shift_count" class Attention: - HEAD_COUNT = "{arch}.attention.head_count" - HEAD_COUNT_KV = "{arch}.attention.head_count_kv" - MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" - CLAMP_KQV = "{arch}.attention.clamp_kqv" - KEY_LENGTH = "{arch}.attention.key_length" - VALUE_LENGTH = "{arch}.attention.value_length" - LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" - LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" - GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" - GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" - CAUSAL = "{arch}.attention.causal" - Q_LORA_RANK = "{arch}.attention.q_lora_rank" - KV_LORA_RANK = "{arch}.attention.kv_lora_rank" - REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" - SLIDING_WINDOW = "{arch}.attention.sliding_window" - SCALE = "{arch}.attention.scale" + HEAD_COUNT = "{arch}.attention.head_count" + HEAD_COUNT_KV = "{arch}.attention.head_count_kv" + MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" + CLAMP_KQV = "{arch}.attention.clamp_kqv" + KEY_LENGTH = 
"{arch}.attention.key_length" + VALUE_LENGTH = "{arch}.attention.value_length" + LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" + LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" + CAUSAL = "{arch}.attention.causal" + Q_LORA_RANK = "{arch}.attention.q_lora_rank" + KV_LORA_RANK = "{arch}.attention.kv_lora_rank" + DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank" + ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank" + VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank" + GATE_LORA_RANK = "{arch}.attention.gate_lora_rank" + REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" + SLIDING_WINDOW = "{arch}.attention.sliding_window" + SCALE = "{arch}.attention.scale" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -257,6 +261,8 @@ class MODEL_ARCH(IntEnum): STARCODER2 = auto() RWKV6 = auto() RWKV6QWEN2 = auto() + RWKV7 = auto() + ARWKV7 = auto() MAMBA = auto() XVERSE = auto() COMMAND_R = auto() @@ -329,8 +335,20 @@ class MODEL_TENSOR(IntEnum): SSM_A = auto() SSM_D = auto() SSM_OUT = auto() + TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() + TIME_MIX_A0 = auto() + TIME_MIX_A1 = auto() + TIME_MIX_A2 = auto() + TIME_MIX_V0 = auto() + TIME_MIX_V1 = auto() + TIME_MIX_V2 = auto() + TIME_MIX_G1 = auto() + TIME_MIX_G2 = auto() + TIME_MIX_K_K = auto() + TIME_MIX_K_A = auto() + TIME_MIX_R_K = auto() TIME_MIX_LERP_X = auto() TIME_MIX_LERP_K = auto() TIME_MIX_LERP_V = auto() @@ -445,6 +463,8 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2", + MODEL_ARCH.RWKV7: "rwkv7", + MODEL_ARCH.ARWKV7: "arwkv7", MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", @@ -517,8 +537,20 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", + MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0", + MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1", + MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2", + MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0", + MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1", + MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2", + MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1", + MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2", + MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k", + MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a", + MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k", MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x", MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k", MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v", @@ -1172,6 +1204,68 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.RWKV7: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_W0, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_A0, + MODEL_TENSOR.TIME_MIX_A1, + MODEL_TENSOR.TIME_MIX_A2, + MODEL_TENSOR.TIME_MIX_V0, + 
MODEL_TENSOR.TIME_MIX_V1, + MODEL_TENSOR.TIME_MIX_V2, + MODEL_TENSOR.TIME_MIX_G1, + MODEL_TENSOR.TIME_MIX_G2, + MODEL_TENSOR.TIME_MIX_K_K, + MODEL_TENSOR.TIME_MIX_K_A, + MODEL_TENSOR.TIME_MIX_R_K, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.CHANNEL_MIX_LERP_K, + MODEL_TENSOR.CHANNEL_MIX_KEY, + MODEL_TENSOR.CHANNEL_MIX_VALUE, + ], + MODEL_ARCH.ARWKV7: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.TIME_MIX_LERP_FUSED, + MODEL_TENSOR.TIME_MIX_W0, + MODEL_TENSOR.TIME_MIX_W1, + MODEL_TENSOR.TIME_MIX_W2, + MODEL_TENSOR.TIME_MIX_A0, + MODEL_TENSOR.TIME_MIX_A1, + MODEL_TENSOR.TIME_MIX_A2, + MODEL_TENSOR.TIME_MIX_V0, + MODEL_TENSOR.TIME_MIX_V1, + MODEL_TENSOR.TIME_MIX_V2, + MODEL_TENSOR.TIME_MIX_G1, + MODEL_TENSOR.TIME_MIX_G2, + MODEL_TENSOR.TIME_MIX_K_K, + MODEL_TENSOR.TIME_MIX_K_A, + MODEL_TENSOR.TIME_MIX_R_K, + MODEL_TENSOR.TIME_MIX_KEY, + MODEL_TENSOR.TIME_MIX_VALUE, + MODEL_TENSOR.TIME_MIX_RECEPTANCE, + MODEL_TENSOR.TIME_MIX_LN, + MODEL_TENSOR.TIME_MIX_OUTPUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.MAMBA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 080d2b9dc..af8b388df 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -767,6 +767,18 @@ class GGUFWriter: def add_kv_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length) + def add_decay_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.DECAY_LORA_RANK.format(arch=self.arch), length) + + def add_iclr_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.ICLR_LORA_RANK.format(arch=self.arch), length) + + def add_value_residual_mix_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_RESIDUAL_MIX_LORA_RANK.format(arch=self.arch), length) + + def add_gate_lora_rank(self, length: int) -> None: + self.add_uint32(Keys.Attention.GATE_LORA_RANK.format(arch=self.arch), length) + def add_relative_attn_buckets_count(self, value: int) -> None: self.add_uint32(Keys.Attention.REL_BUCKETS_COUNT.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 617791e24..8d4a2b032 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -27,7 +27,8 @@ class TensorNameMap: "embedding.word_embeddings", # chatglm "transformer.token_embeddings", # openelm "shared", # t5 - "rwkv.embeddings", # rwkv + "rwkv.embeddings", # rwkv6 + "model.embeddings", # rwkv7 ), # Token type embeddings @@ -42,6 +43,9 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv + "rwkv.blocks.0.pre_ln", # rwkv6 + "model.pre_ln", # rwkv7 + "model.layers.0.pre_norm", # rwkv7 "backbone.norm", # wavtokenizer ), @@ -81,7 +85,8 @@ class TensorNameMap: "encoder.final_layernorm", # chatglm "transformer.norm", # openelm "model.norm", # nemotron - "rwkv.ln_out", # rwkv + "rwkv.ln_out", # rwkv6 + "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer ), @@ -122,14 +127,16 @@ class TensorNameMap: "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "encoder.layers.{bid}.input_layernorm", # chatglm 
"transformer.layers.{bid}.attn_norm", # openelm - "rwkv.blocks.{bid}.ln1", # rwkv + "rwkv.blocks.{bid}.ln1", # rwkv6 + "model.layers.{bid}.ln1", # rwkv7 ), # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( "transformer.h.{bid}.ln_attn", # falcon40b "encoder.layer.{bid}.layer_norm_1", # jina-v2-code - "rwkv.blocks.{bid}.ln2", # rwkv + "rwkv.blocks.{bid}.ln2", # rwkv6 + "model.layers.{bid}.ln2", # rwkv7 ), # Attention query-key-value @@ -462,112 +469,174 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.out_proj", ), + MODEL_TENSOR.TIME_MIX_W0: ( + "model.layers.{bid}.attention.w0", # rwkv7 + ), + MODEL_TENSOR.TIME_MIX_W1: ( - "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 - "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w1", # rwkv6qwen2 + "model.layers.{bid}.attention.w1", # rwkv7 ), MODEL_TENSOR.TIME_MIX_W2: ( - "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 - "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv6 + "model.layers.{bid}.self_attn.time_maa_w2", # rwkv6qwen2 + "model.layers.{bid}.attention.w2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_A0: ( + "model.layers.{bid}.attention.a0", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_A1: ( + "model.layers.{bid}.attention.a1", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_A2: ( + "model.layers.{bid}.attention.a2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_V0: ( + "model.layers.{bid}.attention.v0", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_V1: ( + "model.layers.{bid}.attention.v1", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_V2: ( + "model.layers.{bid}.attention.v2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_G1: ( + "model.layers.{bid}.attention.g1", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_G2: ( + "model.layers.{bid}.attention.g2", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_K_K: ( + "model.layers.{bid}.attention.k_k", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_K_A: ( + "model.layers.{bid}.attention.k_a", # rwkv7 + ), + + MODEL_TENSOR.TIME_MIX_R_K: ( + "model.layers.{bid}.attention.r_k", # rwkv7 ), MODEL_TENSOR.TIME_MIX_LERP_X: ( - "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv6 "model.layers.{bid}.self_attn.time_maa_x", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_K: ( - "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv6 "model.layers.{bid}.self_attn.time_maa_k", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_V: ( - "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv6 "model.layers.{bid}.self_attn.time_maa_v", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_R: ( - "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv6 "model.layers.{bid}.self_attn.time_maa_r", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_G: ( - "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv6 "model.layers.{bid}.self_attn.time_maa_g", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LERP_W: ( - "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv6 "model.layers.{bid}.self_attn.time_maa_w", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_FIRST: ( - "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv6 ), MODEL_TENSOR.TIME_MIX_DECAY: ( - "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + 
"rwkv.blocks.{bid}.attention.time_decay", # rwkv6 "model.layers.{bid}.self_attn.time_decay", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_DECAY_W1: ( - "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv6 "model.layers.{bid}.self_attn.time_decay_w1", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_DECAY_W2: ( - "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 + "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv6 "model.layers.{bid}.self_attn.time_decay_w2", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_KEY: ( - "rwkv.blocks.{bid}.attention.key", # rwkv + "rwkv.blocks.{bid}.attention.key", # rwkv6 "model.layers.{bid}.self_attn.k_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.key", # rwkv7 + "model.layers.{bid}.attention.k_proj", # rwkv7 ), MODEL_TENSOR.TIME_MIX_VALUE: ( - "rwkv.blocks.{bid}.attention.value", # rwkv + "rwkv.blocks.{bid}.attention.value", # rwkv6 "model.layers.{bid}.self_attn.v_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.value", # rwkv7 + "model.layers.{bid}.attention.v_proj", # rwkv7 ), MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.attention.receptance", # rwkv - "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.receptance", # rwkv6 + "model.layers.{bid}.self_attn.q_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.receptance", # rwkv7 + "model.layers.{bid}.attention.r_proj", # rwkv7 ), MODEL_TENSOR.TIME_MIX_GATE: ( - "rwkv.blocks.{bid}.attention.gate", # rwkv - "model.layers.{bid}.self_attn.gate", # rwkv6qwen2 + "rwkv.blocks.{bid}.attention.gate", # rwkv6 + "model.layers.{bid}.self_attn.gate", # rwkv6qwen2 ), MODEL_TENSOR.TIME_MIX_LN: ( - "rwkv.blocks.{bid}.attention.ln_x", # rwkv + "rwkv.blocks.{bid}.attention.ln_x", # rwkv6 + "model.layers.{bid}.attention.ln_x" # rwkv7 ), MODEL_TENSOR.TIME_MIX_OUTPUT: ( - "rwkv.blocks.{bid}.attention.output", # rwkv + "rwkv.blocks.{bid}.attention.output", # rwkv6 "model.layers.{bid}.self_attn.o_proj", # rwkv6qwen2 + "model.layers.{bid}.attention.output", # rwkv7 + "model.layers.{bid}.attention.o_proj", # rwkv7 ), MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( - "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 + "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv6 + "model.layers.{bid}.feed_forward.x_k", # rwkv7 ), MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( - "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 + "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv6 ), MODEL_TENSOR.CHANNEL_MIX_KEY: ( - "rwkv.blocks.{bid}.feed_forward.key", # rwkv + "rwkv.blocks.{bid}.feed_forward.key", # rwkv6 + "model.layers.{bid}.feed_forward.key", # rwkv7 ), MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv + "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv6 ), MODEL_TENSOR.CHANNEL_MIX_VALUE: ( - "rwkv.blocks.{bid}.feed_forward.value", # rwkv + "rwkv.blocks.{bid}.feed_forward.value", # rwkv6 + "model.layers.{bid}.feed_forward.value", # rwkv7 ), MODEL_TENSOR.ATTN_Q_A: ( diff --git a/include/llama.h b/include/llama.h index 28fb82606..73fecf029 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1045,6 +1045,10 @@ extern "C" { // If set to true, the model will only attend to the past tokens LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); + // Set whether the model is in warmup mode or not + // If true, all model tensors are activated during llama_decode() to load and cache their weights. 
+ LLAMA_API void llama_set_warmup(struct llama_context * ctx, bool warmup); + // Set abort callback LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 28f2bbc8f..9debb56cc 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -59,6 +59,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_EXAONE, "exaone" }, { LLM_ARCH_RWKV6, "rwkv6" }, { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" }, + { LLM_ARCH_RWKV7, "rwkv7" }, + { LLM_ARCH_ARWKV7, "arwkv7" }, { LLM_ARCH_GRANITE, "granite" }, { LLM_ARCH_GRANITE_MOE, "granitemoe" }, { LLM_ARCH_CHAMELEON, "chameleon" }, @@ -110,22 +112,26 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, - { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, - { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, - { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, - { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, - { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" }, - { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, - { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, - { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, - { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" }, - { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" }, - { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, - { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, - { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, - { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, - { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, - { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, + { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, + { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, + { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, + { LLM_KV_ATTENTION_CLAMP_KQV, "%s.attention.clamp_kqv" }, + { LLM_KV_ATTENTION_KEY_LENGTH, "%s.attention.key_length" }, + { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, + { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, + { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" }, + { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, + { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, + { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, + { LLM_KV_ATTENTION_DECAY_LORA_RANK, "%s.attention.decay_lora_rank" }, + { LLM_KV_ATTENTION_ICLR_LORA_RANK, "%s.attention.iclr_lora_rank" }, + { LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, "%s.attention.value_residual_mix_lora_rank" }, + { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" }, + { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, + { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, + { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, @@ -1238,6 +1244,74 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, 
"blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_RWKV7, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, + { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" }, + { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, + { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, + { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" }, + { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" }, + { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" }, + { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" }, + { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" }, + { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" }, + { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" }, + { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" }, + { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" }, + { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" }, + { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" }, + { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, + { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, + { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, + { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, + { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, + { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, + { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" }, + { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" }, + { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" }, + }, + }, + { + LLM_ARCH_ARWKV7, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_TIME_MIX_W0, "blk.%d.time_mix_w0" }, + { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" }, + { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" }, + { LLM_TENSOR_TIME_MIX_A0, "blk.%d.time_mix_a0" }, + { LLM_TENSOR_TIME_MIX_A1, "blk.%d.time_mix_a1" }, + { LLM_TENSOR_TIME_MIX_A2, "blk.%d.time_mix_a2" }, + { LLM_TENSOR_TIME_MIX_V0, "blk.%d.time_mix_v0" }, + { LLM_TENSOR_TIME_MIX_V1, "blk.%d.time_mix_v1" }, + { LLM_TENSOR_TIME_MIX_V2, "blk.%d.time_mix_v2" }, + { LLM_TENSOR_TIME_MIX_G1, "blk.%d.time_mix_g1" }, + { LLM_TENSOR_TIME_MIX_G2, "blk.%d.time_mix_g2" }, + { LLM_TENSOR_TIME_MIX_K_K, "blk.%d.time_mix_k_k" }, + { LLM_TENSOR_TIME_MIX_K_A, "blk.%d.time_mix_k_a" }, + { LLM_TENSOR_TIME_MIX_R_K, "blk.%d.time_mix_r_k" }, + { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" }, + { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" }, + { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" }, + { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" }, + { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" }, + { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_GRANITE, { @@ -1397,6 +1471,12 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_TIME_MIX_A1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + 
{LLM_TENSOR_TIME_MIX_A2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_TIME_MIX_V1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_TIME_MIX_V2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_TIME_MIX_G1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_TIME_MIX_G2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_DECAY_W1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_DECAY_W2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_KEY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, @@ -1415,6 +1495,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_R, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_TIME_MIX_K_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_TIME_MIX_K_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_TIME_MIX_R_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_W, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, @@ -1422,6 +1505,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_TIME_MIX_LERP_G, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_LERP_FUSED, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_DECAY, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_TIME_MIX_W0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_TIME_MIX_A0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_TIME_MIX_V0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_TIME_MIX_FIRST, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_RWKV_WKV6}}, {LLM_TENSOR_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_NORM_2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 2ec2e2362..a28815d8a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -63,6 +63,8 @@ enum llm_arch { LLM_ARCH_EXAONE, LLM_ARCH_RWKV6, LLM_ARCH_RWKV6QWEN2, + LLM_ARCH_RWKV7, + LLM_ARCH_ARWKV7, LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, @@ -127,6 +129,10 @@ enum llm_kv { LLM_KV_ATTENTION_CAUSAL, LLM_KV_ATTENTION_Q_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK, + LLM_KV_ATTENTION_DECAY_LORA_RANK, + LLM_KV_ATTENTION_ICLR_LORA_RANK, + LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, + LLM_KV_ATTENTION_GATE_LORA_RANK, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, @@ -250,8 +256,20 @@ enum llm_tensor { LLM_TENSOR_SSM_A, LLM_TENSOR_SSM_D, LLM_TENSOR_SSM_OUT, + LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, + LLM_TENSOR_TIME_MIX_A0, + LLM_TENSOR_TIME_MIX_A1, + LLM_TENSOR_TIME_MIX_A2, + LLM_TENSOR_TIME_MIX_V0, + LLM_TENSOR_TIME_MIX_V1, + LLM_TENSOR_TIME_MIX_V2, + LLM_TENSOR_TIME_MIX_G1, + LLM_TENSOR_TIME_MIX_G2, + LLM_TENSOR_TIME_MIX_K_K, + LLM_TENSOR_TIME_MIX_K_A, + LLM_TENSOR_TIME_MIX_R_K, LLM_TENSOR_TIME_MIX_LERP_X, LLM_TENSOR_TIME_MIX_LERP_W, LLM_TENSOR_TIME_MIX_LERP_K, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d89e1ac2c..c77195d2f 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -40,6 +40,7 @@ llama_context::llama_context( cparams.flash_attn = params.flash_attn; cparams.no_perf = params.no_perf; 
cparams.pooling_type = params.pooling_type; + cparams.warmup = false; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -285,11 +286,15 @@ llama_context::llama_context( // reserve worst-case graph if (!hparams.vocab_only) { - uint32_t n_seqs = 1; // TODO: worst-case number of sequences - uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); + const uint32_t n_seqs = 1; // TODO: worst-case number of sequences + const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch); llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + // restore later + // TODO: something cleaner + const auto n_outputs_save = n_outputs; + // max number of outputs n_outputs = n_tokens; @@ -341,6 +346,8 @@ llama_context::llama_context( } } + n_outputs = n_outputs_save; + for (size_t i = 0; i < backend_ptrs.size(); ++i) { ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; @@ -443,10 +450,10 @@ ggml_tensor * llama_context::build_rope_shift( ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, + float freq_base, + float freq_scale, ggml_backend_buffer * bbuf) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & freq_base = cparams.rope_freq_base; - const auto & freq_scale = cparams.rope_freq_scale; const auto & yarn_ext_factor = cparams.yarn_ext_factor; const auto & yarn_attn_factor = cparams.yarn_attn_factor; @@ -538,6 +545,13 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const bool is_swa = hparams.is_swa(il); + + // note: the swa rope params could become part of the cparams in the future + // if we decide to make them configurable, like the non-sliding ones + const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; + const float freq_scale_l = is_swa ? 
hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; + ggml_tensor * rope_factors = kv_self->cbs.get_rope_factors(n_ctx_per_seq(), il); ggml_tensor * k = @@ -547,7 +561,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer); ggml_build_forward_expand(gf, cur); } @@ -942,6 +956,12 @@ void llama_context::set_causal_attn(bool value) { cparams.causal_attn = value; } +void llama_context::set_warmup(bool value) { + LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + + cparams.warmup = value; +} + void llama_context::set_adapter_lora( llama_adapter_lora * adapter, float scale) { @@ -1048,6 +1068,13 @@ int llama_context::encode(llama_batch_ext & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); + const auto causal_attn_org = cparams.causal_attn; + + // always use non-causal attention for encoder graphs + // TODO: this is a tmp solution until we have a proper way to support enc-dec models + // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223 + cparams.causal_attn = false; + auto * gf = graph_init(); auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER); @@ -1055,6 +1082,8 @@ int llama_context::encode(llama_batch_ext & inp_batch) { res->set_inputs(&ubatch); + cparams.causal_attn = causal_attn_org; + const auto compute_status = graph_compute(gf, n_tokens > 1); switch (compute_status) { case GGML_STATUS_SUCCESS: @@ -1594,7 +1623,7 @@ void llama_context::output_reorder() { // int32_t llama_context::graph_max_nodes() const { - return std::max(8192, 5*model.n_tensors()); + return std::max(65536, 5*model.n_tensors()); } ggml_cgraph * llama_context::graph_init() { @@ -2372,6 +2401,10 @@ void llama_set_causal_attn(llama_context * ctx, bool causal_attn) { ctx->set_causal_attn(causal_attn); } +void llama_set_warmup(llama_context * ctx, bool warmup) { + ctx->set_warmup(warmup); +} + void llama_synchronize(llama_context * ctx) { ctx->synchronize(); } diff --git a/src/llama-context.h b/src/llama-context.h index 29bb230f1..fff1b9e53 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -64,6 +64,7 @@ struct llama_context { void set_embeddings (bool value); void set_causal_attn(bool value); + void set_warmup(bool value); void set_adapter_lora( llama_adapter_lora * adapter, @@ -172,6 +173,8 @@ private: ggml_tensor * cur, ggml_tensor * shift, ggml_tensor * factors, + float freq_base, + float freq_scale, ggml_backend_buffer * bbuf) const; llm_graph_result_ptr build_kv_self_shift( diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 252012f3d..30e550f02 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -29,6 +29,7 @@ struct llama_cparams { bool offload_kqv; bool flash_attn; bool no_perf; + bool warmup; enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1e3f2efc8..4e9087339 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -577,7 +577,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : n_embd_head_v (hparams.n_embd_head_v), n_embd_v_gqa (hparams.n_embd_v_gqa()), n_expert (hparams.n_expert), - n_expert_used (hparams.n_expert_used), + n_expert_used 
(cparams.warmup ? hparams.n_expert : hparams.n_expert_used), freq_base (cparams.rope_freq_base), freq_scale (cparams.rope_freq_scale), ext_factor (cparams.yarn_ext_factor), @@ -1311,29 +1311,23 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } -llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified( - bool causal, - bool swa) const { +llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() const { const llama_kv_cache_unified * kv_self = static_cast(memory); auto inp = std::make_unique(hparams, cparams, kv_self); const auto n_kv = kv_self->n; - inp->self_kq_mask = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp->self_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask, "KQ_mask", -1); ggml_set_input(inp->self_kq_mask); inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; - if (swa) { + if (hparams.n_swa_pattern > 1) { GGML_ASSERT(hparams.n_swa > 0); - inp->self_kq_mask_swa = causal - ? ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)) - : ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); + inp->self_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); //cb(inp->self_kq_mask_swa, "KQ_mask_swa", -1); ggml_set_input(inp->self_kq_mask_swa); @@ -1403,36 +1397,9 @@ ggml_tensor * llm_graph_context::build_attn( ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); } - // TODO: improve - bool is_sliding = false; + const bool is_swa = hparams.is_swa(il); - switch (arch) { - case LLM_ARCH_COHERE2: - { - const int32_t sliding_window_pattern = 4; - is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - } break; - case LLM_ARCH_GEMMA2: - { - const int32_t sliding_window_pattern = 2; - is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - } break; - case LLM_ARCH_GEMMA3: - { - const int32_t sliding_window_pattern = 6; - is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); - } break; - case LLM_ARCH_PHI3: - { - is_sliding = hparams.n_swa > 0; - } break; - default: - { - is_sliding = false; - } - }; - - const auto & kq_mask = is_sliding ? inp->get_kq_mask_swa() : inp->get_kq_mask(); + const auto & kq_mask = is_swa ? 
inp->get_kq_mask_swa() : inp->get_kq_mask(); const auto n_kv = kv_self->n; diff --git a/src/llama-graph.h b/src/llama-graph.h index b7a66d189..c4328e6f9 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -509,9 +509,7 @@ struct llm_graph_context { float kq_scale, int il) const; - llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified( - bool causal, - bool swa) const; + llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const; ggml_tensor * build_attn( llm_graph_input_attn_kv_unified * inp, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index ea87b2953..90dfe7a7f 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -69,3 +69,11 @@ uint32_t llama_hparams::n_embd_v_s() const { // corresponds to Mamba's ssm_states size return ssm_d_state * ssm_d_inner; } + +bool llama_hparams::is_swa(uint32_t il) const { + if (il < n_layer) { + return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); + } + + GGML_ABORT("fatal error"); +} diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 1fe454103..bb17ba86d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -36,6 +36,7 @@ struct llama_hparams { uint32_t n_layer; uint32_t n_rot; uint32_t n_swa = 0; // sliding window attention (SWA) + uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head uint32_t n_expert = 0; @@ -75,10 +76,16 @@ struct llama_hparams { uint32_t time_decay_extra_dim = 0; uint32_t wkv_head_size = 0; uint32_t token_shift_count = 2; + uint32_t n_lora_decay = 0; + uint32_t n_lora_iclr = 0; + uint32_t n_lora_value_res_mix = 0; + uint32_t n_lora_gate = 0; float rope_attn_factor = 1.0f; float rope_freq_base_train; + float rope_freq_base_train_swa; float rope_freq_scale_train; + float rope_freq_scale_train_swa; uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul; @@ -133,6 +140,8 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_v_s() const; + + bool is_swa(uint32_t il) const; }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 522219c01..c571aa69b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -32,6 +32,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_109M: return "109M"; case LLM_TYPE_137M: return "137M"; case LLM_TYPE_160M: return "160M"; + case LLM_TYPE_190M: return "190M"; case LLM_TYPE_220M: return "220M"; case LLM_TYPE_250M: return "250M"; case LLM_TYPE_270M: return "270M"; @@ -48,6 +49,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_1_6B: return "1.6B"; case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2_8B: return "2.8B"; + case LLM_TYPE_2_9B: return "2.9B"; case LLM_TYPE_3B: return "3B"; case LLM_TYPE_4B: return "4B"; case LLM_TYPE_6B: return "6B"; @@ -475,6 +477,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { } hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; + // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); // non-transformer models do not have attention heads @@ -780,9 +786,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_swa = 2047; } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) { // default value for Phi-3-mini-128k-instruct + // note: this seems incorrect because the window is bigger than the train context? hparams.n_swa = 262144; } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) { // default value for Phi-3-medium-128k-instruct + // note: this seems incorrect because the window is equal to the train context? hparams.n_swa = 131072; } bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); @@ -858,11 +866,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_GEMMA2: { hparams.n_swa = 4096; // default value of gemma 2 + hparams.n_swa_pattern = 2; + hparams.attn_soft_cap = true; + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - hparams.attn_soft_cap = true; switch (hparams.n_layer) { case 26: type = LLM_TYPE_2B; break; @@ -873,6 +883,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_GEMMA3: { + hparams.n_swa_pattern = 6; + + hparams.rope_freq_base_train_swa = 10000.0f; + hparams.rope_freq_scale_train_swa = 1.0f; + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -952,6 +967,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_COHERE2: { + hparams.n_swa_pattern = 4; + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -990,6 +1007,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; + case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -1234,6 +1252,36 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_RWKV7: + case LLM_ARCH_ARWKV7: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); + ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size); + ml.get_key(LLM_KV_ATTENTION_DECAY_LORA_RANK, hparams.n_lora_decay); + ml.get_key(LLM_KV_ATTENTION_ICLR_LORA_RANK, hparams.n_lora_iclr); + ml.get_key(LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK, hparams.n_lora_value_res_mix); + ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); + ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); + + switch (hparams.n_layer) { + case 12: type = LLM_TYPE_190M; break; + case 24: + switch (hparams.n_embd) { + case 1024: type = 
LLM_TYPE_450M; break; + case 2048: type = LLM_TYPE_1_5B; break; + default: type = LLM_TYPE_UNKNOWN; + } break; + case 28: + switch (hparams.n_embd) { + case 1536: type = LLM_TYPE_1_5B; break; + case 3584: type = LLM_TYPE_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } break; + case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_GRANITE: case LLM_ARCH_GRANITE_MOE: { @@ -1340,13 +1388,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0); const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { + const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { - LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(cpu_dev)); + LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; } const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); auto * dev = devices.at(layer_gpu); - LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s\n", il, ggml_backend_dev_name(dev)); + LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(dev), is_swa); return {dev, &pimpl->gpu_buft_list.at(dev)}; }; @@ -2710,6 +2759,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_OLMO2: { + const int64_t n_embd_head = n_embd / n_head; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -2724,7 +2775,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_head_kv * n_embd_head}, 0); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); @@ -3347,6 +3398,146 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_RWKV7: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // Block 0, LN0 + tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); + tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + const int n_lora_decay = hparams.n_lora_decay; + const int n_lora_iclr = hparams.n_lora_iclr; + const int n_lora_value_res_mix = hparams.n_lora_value_res_mix; + const int n_lora_gate = 
hparams.n_lora_gate; + const int attn_hidden_size = n_embd; + const int ffn_size = hparams.n_ff_arr[0]; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0); + + layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0); + layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0); + + layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0); + layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0); + layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0); + + layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0); + layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0); + layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0); + + if (i == 0) { + // actually not used + layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0); + layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0); + layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0); + } else { + layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0); + layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0); + layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0); + } + + layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, 0); + layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, 0); + + layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0); + + layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0); + layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0); + layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0); + + layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0); + layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0); + layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0); + + layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0); + layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0); + layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0); + + layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0); + + layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0); + layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0); + } + + } break; + case LLM_ARCH_ARWKV7: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), 
{n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + const int n_lora_decay = hparams.n_lora_decay; + const int n_lora_iclr = hparams.n_lora_iclr; + const int n_lora_value_res_mix = hparams.n_lora_value_res_mix; + const int n_lora_gate = hparams.n_lora_gate; + const int attn_hidden_size = n_embd; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.time_mix_w0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W0, "weight", i), {n_embd}, 0); + layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, n_lora_decay}, 0); + layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {n_lora_decay, n_embd}, 0); + + layer.time_mix_a0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A0, "weight", i), {n_embd}, 0); + layer.time_mix_a1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A1, "weight", i), {n_embd, n_lora_iclr}, 0); + layer.time_mix_a2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_A2, "weight", i), {n_lora_iclr, n_embd}, 0); + + if (i == 0) { + // actually not used + layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0); + layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_iclr}, 0); + layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_iclr, n_embd}, 0); + } else { + layer.time_mix_v0 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V0, "weight", i), {n_embd}, 0); + layer.time_mix_v1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V1, "weight", i), {n_embd, n_lora_value_res_mix}, 0); + layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0); + } + + layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + try { + layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0); + } catch(std::runtime_error & e) { + // ARWKV models may not have gate tensors + layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0); + } + + layer.time_mix_k_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_K, "weight", i), {attn_hidden_size}, 0); + layer.time_mix_k_a = create_tensor(tn(LLM_TENSOR_TIME_MIX_K_A, "weight", i), {attn_hidden_size}, 0); + layer.time_mix_r_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_R_K, "weight", i), {attn_hidden_size}, 0); + + layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0); + layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0); + layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0); + + layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0); + + layer.ffn_norm 
= create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + + } break; case LLM_ARCH_CHAMELEON: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -3696,6 +3887,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); + LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v); LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); @@ -3857,7 +4049,7 @@ struct llm_build_llama : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4020,7 +4212,7 @@ struct llm_build_deci : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -4178,7 +4370,7 @@ struct llm_build_baichuan : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? 
build_inp_pos() : nullptr; - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -4296,7 +4488,7 @@ struct llm_build_xverse : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -4404,7 +4596,7 @@ struct llm_build_falcon : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * attn_norm; @@ -4529,7 +4721,7 @@ struct llm_build_grok : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -4683,7 +4875,7 @@ struct llm_build_dbrx : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -4807,7 +4999,7 @@ struct llm_build_starcoder : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -4910,7 +5102,7 @@ struct llm_build_refact : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -5173,7 +5365,7 @@ struct llm_build_bloom : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); inpL = build_norm(inpL, model.tok_norm, @@ -5278,7 +5470,7 @@ struct llm_build_mpt : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); if (model.pos_embd) { // inp_pos - contains the positions @@ -5422,7 +5614,7 @@ struct llm_build_stablelm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { // norm @@ -5573,7 +5765,7 @@ struct llm_build_qwen : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -5689,7 +5881,7 @@ struct llm_build_qwen2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * 
inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -5804,7 +5996,7 @@ struct llm_build_qwen2vl : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -5924,7 +6116,7 @@ struct llm_build_qwen2moe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -6073,7 +6265,7 @@ struct llm_build_phi2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { attn_norm_output = build_norm(inpL, @@ -6197,7 +6389,7 @@ struct llm_build_phi3 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, true); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -6343,7 +6535,7 @@ struct llm_build_plamo : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { @@ -6451,7 +6643,7 @@ struct llm_build_gpt2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); cb(pos, "pos_embd", -1); @@ -6559,7 +6751,7 @@ struct llm_build_codeshell : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -6672,7 +6864,7 @@ struct llm_build_orion : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -6793,7 +6985,7 @@ struct llm_build_internlm2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -6923,7 +7115,7 @@ struct llm_build_minicpm3 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -7127,7 +7319,7 @@ struct llm_build_gemma : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = 
build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { // norm @@ -7237,7 +7429,7 @@ struct llm_build_gemma2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, true); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { // norm @@ -7372,17 +7564,13 @@ struct llm_build_gemma3 : public llm_graph_context { ggml_tensor * inp_pos = build_inp_pos(); // TODO: is causal == true correct? might need some changes - auto * inp_attn = build_attn_inp_kv_unified(true, true); - - // "5-to-1 interleaved attention" - // 5 layers of local attention followed by 1 layer of global attention - static const int sliding_window_pattern = 6; + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + const bool is_swa = hparams.is_swa(il); - const float freq_base_l = is_sliding ? 10000.0f : freq_base; - const float freq_scale_l = is_sliding ? 1.0f : freq_scale; + const float freq_base_l = is_swa ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; + const float freq_scale_l = is_swa ? hparams.rope_freq_scale_train_swa : cparams.rope_freq_scale; // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); @@ -7505,7 +7693,7 @@ struct llm_build_starcoder2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -7818,7 +8006,7 @@ struct llm_build_command_r : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { @@ -7968,15 +8156,10 @@ struct llm_build_cohere2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, true); - - // sliding window switch pattern - const int32_t sliding_window_pattern = 4; + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { - // three layers sliding window attention (window size 4096) and ROPE - // fourth layer uses global attention without positional embeddings - const bool is_sliding = il % sliding_window_pattern < (sliding_window_pattern - 1); + const bool is_swa = hparams.is_swa(il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); @@ -8010,7 +8193,7 @@ struct llm_build_cohere2 : public llm_graph_context { cb(Vcur, "Vcur", il); } - if (is_sliding) { + if (is_swa) { Qcur = ggml_rope_ext(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); @@ -8105,7 +8288,7 @@ struct llm_build_olmo : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -8227,7 +8410,7 
@@ struct llm_build_olmo2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -8353,7 +8536,7 @@ struct llm_build_olmoe : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -8476,7 +8659,7 @@ struct llm_build_openelm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { const int64_t n_head = hparams.n_head(il); @@ -8606,7 +8789,7 @@ struct llm_build_gptneox : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -8752,7 +8935,7 @@ struct llm_build_arctic : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -8884,7 +9067,7 @@ struct llm_build_deepseek : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; @@ -9049,7 +9232,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -9269,7 +9452,7 @@ struct llm_build_bitnet : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -9527,7 +9710,7 @@ struct llm_build_t5_dec : public llm_graph_context { const int64_t n_outputs_enc = embd_enc->ne[1]; - auto * inp_attn_self = build_attn_inp_kv_unified(true, false); + auto * inp_attn_self = build_attn_inp_kv_unified(); auto * inp_attn_cross = build_attn_inp_cross(); for (int il = 0; il < n_layer; ++il) { @@ -9693,7 +9876,7 @@ struct llm_build_jais : public llm_graph_context { inpL = build_inp_embd(model.tok_embd); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { cur = build_norm(inpL, @@ -9789,7 +9972,7 @@ struct llm_build_chatglm : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -9921,7 +10104,7 @@ struct llm_build_nemotron : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -10044,7 +10227,7 @@ struct llm_build_exaone : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(true, false); + auto * inp_attn = build_attn_inp_kv_unified(); for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -10201,6 +10384,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { const auto n_tokens = ubatch.n_tokens; const auto n_seqs = ubatch.n_seqs; + const auto n_seq_tokens = ubatch.n_seq_tokens; const auto n_embd = hparams.n_embd; const auto head_size = hparams.wkv_head_size; const auto n_head = n_embd / head_size; @@ -10213,6 +10397,10 @@ struct llm_build_rwkv6_base : public llm_graph_context { bool is_qrwkv = layer.time_mix_first == nullptr; ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur); + + sx = ggml_reshape_2d(ctx0, sx, n_embd, n_tokens); + cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); + ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_x), cur); xxx = ggml_reshape_4d( @@ -10355,7 +10543,7 @@ struct llm_build_rwkv6_base : public llm_graph_context { cur = ggml_mul(ctx0, cur, g); cur = build_lora_mm(layer.time_mix_output, cur); - return cur; + return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs); } }; @@ -10378,6 +10566,7 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); ggml_tensor * token_shift = 
build_rwkv_token_shift_load( gf, state_copy, state_mask, ubatch, il @@ -10411,9 +10600,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { 1 ); - cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); - cur = ggml_add(ctx0, cur, ffn_inp); - token_shift = ggml_concat(ctx0, ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)), ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)), @@ -10421,6 +10607,18 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { ); ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il)); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids); + ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids); + x_prev = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids); + cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids); + } + + cur = build_rwkv6_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV6); + cur = ggml_add(ctx0, cur, ffn_inp); + if (hparams.rescale_every_n_layers != 0 && (il + 1) % hparams.rescale_every_n_layers == 0) { cur = ggml_scale(ctx0, cur, 0.5F); } @@ -10433,12 +10631,6 @@ struct llm_build_rwkv6 : public llm_build_rwkv6_base { } cur = inpL; - - ggml_tensor * inp_out_ids = build_inp_out_ids(); - - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1); cb(cur, "result_norm", -1); @@ -10470,10 +10662,9 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { const auto n_seq_tokens = ubatch.n_seq_tokens; const auto n_seqs = ubatch.n_seqs; - inpL = build_inp_embd(model.tok_embd); - for (int il = 0; il < n_layer; ++il) { const llama_layer * layer = &model.layers[il]; + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs); ggml_tensor * token_shift = build_rwkv_token_shift_load( gf, state_copy, state_mask, ubatch, il @@ -10497,6 +10688,13 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids); + } + // feed-forward network cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, @@ -10521,10 +10719,358 @@ struct llm_build_rwkv6qwen2 : public llm_build_rwkv6_base { } cur = inpL; - ggml_tensor * inp_out_ids = build_inp_out_ids(); - cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens); - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + +struct llm_build_rwkv7_base : public llm_graph_context { + const llama_model & model; + + llm_build_rwkv7_base(const llama_model & 
+    }
+
+    ggml_tensor * build_rwkv7_channel_mix(
+            const llama_layer * layer,
+            ggml_tensor * cur,
+            ggml_tensor * x_prev,
+            llm_arch arch) const {
+        ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
+        switch (arch) {
+            case LLM_ARCH_RWKV7:
+                {
+                    ggml_tensor * xk = ggml_add(ctx0, ggml_mul(ctx0, sx, layer->channel_mix_lerp_k), cur);
+
+                    ggml_tensor * k = ggml_sqr(
+                        ctx0,
+                        ggml_relu(
+                            ctx0,
+                            build_lora_mm(layer->channel_mix_key, xk)
+                        )
+                    );
+
+                    cur = build_lora_mm(layer->channel_mix_value, k);
+                } break;
+            default:
+                GGML_ABORT("fatal error");
+        }
+
+        return cur;
+    }
+
+    ggml_tensor * build_rwkv7_time_mix(
+            ggml_cgraph * gf,
+            ggml_tensor * cur,
+            ggml_tensor * x_prev,
+            ggml_tensor * state_copy,
+            ggml_tensor * state_mask,
+            ggml_tensor *& first_layer_value,
+            const llama_ubatch & ubatch,
+            int il) const {
+        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+
+        const auto n_tokens = ubatch.n_tokens;
+        const auto n_seqs = ubatch.n_seqs;
+        const auto n_embd = hparams.n_embd;
+        const auto head_size = hparams.wkv_head_size;
+        const auto head_count = n_embd / head_size;
+        const auto n_seq_tokens = ubatch.n_seq_tokens;
+
+        const auto kv_head = kv_self->head;
+
+        const auto & layer = model.layers[il];
+
+        bool has_gating = layer.time_mix_g1 && layer.time_mix_g2;
+
+        ggml_tensor * sx = ggml_sub(ctx0, x_prev, cur);
+        ggml_tensor * dummy = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_embd, n_seq_tokens, n_seqs, has_gating ? 6 : 5);
+        sx = ggml_repeat(ctx0, sx, dummy);
+
+        ggml_tensor * xxx = ggml_add(ctx0, ggml_mul(ctx0, sx, layer.time_mix_lerp_fused), cur);
+
+        ggml_tensor * xr = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], 0);
+        ggml_tensor * xw = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
+        ggml_tensor * xk = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
+        ggml_tensor * xv = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
+        ggml_tensor * xa = ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
+        ggml_tensor * xg = has_gating ? ggml_view_2d(ctx0, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 5 * sizeof(float)) : nullptr;
+
+        ggml_tensor * r = build_lora_mm(layer.time_mix_receptance, xr);
+        ggml_tensor * w = ggml_add(
+            ctx0,
+            ggml_mul_mat(ctx0, layer.time_mix_w2, ggml_tanh(ctx0, ggml_mul_mat(ctx0, layer.time_mix_w1, xw))),
+            layer.time_mix_w0
+        );
+        w = ggml_exp(ctx0, ggml_scale(ctx0, ggml_sigmoid(ctx0, w), -0.606531));
+
+        ggml_tensor * k = build_lora_mm(layer.time_mix_key, xk);
+        ggml_tensor * v = build_lora_mm(layer.time_mix_value, xv);
+        if (first_layer_value == nullptr) {
+            first_layer_value = v;
+        } else {
+            // Add the first layer value as a residual connection.
+            v = ggml_add(ctx0, v,
+                    ggml_mul(ctx0,
+                        ggml_sub(ctx0, first_layer_value, v),
+                        ggml_sigmoid(ctx0, ggml_add(ctx0,
+                                ggml_mul_mat(ctx0, layer.time_mix_v2, ggml_mul_mat(ctx0, layer.time_mix_v1, xv)),
+                                layer.time_mix_v0
+                            )
+                        )
+                    )
+                );
+        }
+
+        ggml_tensor * g = nullptr;
+        if (layer.time_mix_g1 && layer.time_mix_g2) {
+            g = ggml_mul_mat(ctx0, layer.time_mix_g2, ggml_sigmoid(ctx0, ggml_mul_mat(ctx0, layer.time_mix_g1, xg)));
+        }
+
+        ggml_tensor * a = ggml_sigmoid(ctx0,
+                ggml_add(
+                    ctx0,
+                    ggml_mul_mat(ctx0, layer.time_mix_a2, ggml_mul_mat(ctx0, layer.time_mix_a1, xa)),
+                    layer.time_mix_a0
+                )
+            );
+
+        ggml_tensor * kk = ggml_reshape_3d(ctx0, ggml_mul(ctx0, k, layer.time_mix_k_k), head_size, head_count, n_tokens);
+        kk = ggml_l2_norm(ctx0, kk, 1e-12);
+
+        ggml_tensor * ka = ggml_mul(ctx0, k, layer.time_mix_k_a);
+        k = ggml_add(ctx0, k, ggml_sub(ctx0, ggml_mul(ctx0, a, ka), ka));
+
+        r = ggml_reshape_3d(ctx0, r, head_size, head_count, n_tokens);
+        w = ggml_reshape_3d(ctx0, w, head_size, head_count, n_tokens);
+        k = ggml_reshape_3d(ctx0, k, head_size, head_count, n_tokens);
+        v = ggml_reshape_3d(ctx0, v, head_size, head_count, n_tokens);
+        a = ggml_reshape_3d(ctx0, a, head_size, head_count, n_tokens);
+
+        ggml_tensor * wkv_state = build_copy_mask_state(
+                gf, kv_self->v_l[il], state_copy, state_mask,
+                hparams.n_embd_v_s(), n_seqs);
+
+        ggml_tensor * wkv_output = ggml_rwkv_wkv7(ctx0, r, w, k, v, ggml_neg(ctx0, kk), ggml_mul(ctx0, kk, a), wkv_state);
+        cur = ggml_view_1d(ctx0, wkv_output, n_embd * n_tokens, 0);
+        wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
+
+        ggml_build_forward_expand(
+            gf,
+            ggml_cpy(
+                ctx0,
+                wkv_state,
+                ggml_view_1d(
+                    ctx0,
+                    kv_self->v_l[il],
+                    hparams.n_embd_v_s() * n_seqs,
+                    hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self->v_l[il])
+                )
+            )
+        );
+
+        if (layer.time_mix_ln && layer.time_mix_ln_b) {
+            // group norm with head_count groups
+            cur = ggml_reshape_3d(ctx0, cur, n_embd / head_count, head_count, n_tokens);
+            cur = ggml_norm(ctx0, cur, 64e-5f);
+
+            // Convert back to regular vectors.
+            cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.time_mix_ln), layer.time_mix_ln_b);
+        } else {
+            cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
+        }
+
+        ggml_tensor * rk = ggml_sum_rows(ctx0,
+                ggml_mul(ctx0, ggml_mul(ctx0, k, r), ggml_reshape_2d(ctx0, layer.time_mix_r_k, head_size, head_count)));
+        cur = ggml_add(ctx0, cur, ggml_reshape_2d(ctx0, ggml_mul(ctx0, v, rk), n_embd, n_tokens));
+
+        if (has_gating) {
+            cur = ggml_mul(ctx0, cur, g);
+        }
+        cur = build_lora_mm(layer.time_mix_output, cur);
+
+        return ggml_reshape_3d(ctx0, cur, n_embd, n_seq_tokens, n_seqs);
+    }
+};
+
+struct llm_build_rwkv7 : public llm_build_rwkv7_base {
+    llm_build_rwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
+        GGML_ASSERT(hparams.token_shift_count == 2);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * v_first = nullptr;
+
+        inpL = build_inp_embd(model.tok_embd);
+        inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
+
+        ggml_tensor * state_copy = build_inp_s_copy();
+        ggml_tensor * state_mask = build_inp_s_mask();
+
+        const auto n_embd = hparams.n_embd;
+        const auto n_seq_tokens = ubatch.n_seq_tokens;
+        const auto n_seqs = ubatch.n_seqs;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];
+            inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(
+                    gf, state_copy, state_mask, ubatch, il
+            );
+
+            ggml_tensor * att_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], 0);
+            ggml_tensor * ffn_shift = ggml_view_3d(ctx0, token_shift, n_embd, 1, n_seqs, token_shift->nb[1], token_shift->nb[2], n_embd * ggml_element_size(token_shift));
+
+            ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM, il);
+            cb(att_norm, "attn_norm", il);
+
+            ggml_tensor * x_prev = ggml_concat(
+                ctx0,
+                att_shift,
+                ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+                1
+            );
+
+            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            ggml_tensor * ffn_norm = build_norm(ffn_inp, layer->attn_norm_2, layer->attn_norm_2_b, LLM_NORM, il);
+            cb(ffn_norm, "ffn_norm", il);
+
+            x_prev = ggml_concat(
+                ctx0,
+                ffn_shift,
+                ggml_view_3d(ctx0, ffn_norm, n_embd, n_seq_tokens - 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], 0),
+                1
+            );
+
+            token_shift = ggml_concat(ctx0,
+                    ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm)),
+                    ggml_view_3d(ctx0, ffn_norm, n_embd, 1, n_seqs, ffn_norm->nb[1], ffn_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(ffn_norm)),
+                    1
+            );
+            ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                ffn_inp  = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
+                ffn_norm = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_norm, n_embd, n_tokens), inp_out_ids);
+                x_prev   = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, x_prev, n_embd, n_tokens), inp_out_ids);
+            }
+
+            cur = build_rwkv7_channel_mix(layer, ffn_norm, x_prev, LLM_ARCH_RWKV7);
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+        cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
+
+struct llm_build_arwkv7 : public llm_build_rwkv7_base {
+    llm_build_arwkv7(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_build_rwkv7_base(model, params) {
+        GGML_ASSERT(n_embd == hparams.n_embd_k_s());
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+        ggml_tensor * v_first = nullptr;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        ggml_tensor * state_copy = build_inp_s_copy();
+        ggml_tensor * state_mask = build_inp_s_mask();
+
+        const auto n_embd = hparams.n_embd;
+        const auto n_seq_tokens = ubatch.n_seq_tokens;
+        const auto n_seqs = ubatch.n_seqs;
+
+        for (int il = 0; il < n_layer; ++il) {
+            const llama_layer * layer = &model.layers[il];
+            inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
+
+            ggml_tensor * token_shift = build_rwkv_token_shift_load(
+                    gf, state_copy, state_mask, ubatch, il
+            );
+
+            ggml_tensor * att_norm = build_norm(inpL, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, il);
+            cb(att_norm, "attn_norm", il);
+
+            ggml_tensor * x_prev = ggml_concat(
+                ctx0,
+                token_shift,
+                ggml_view_3d(ctx0, att_norm, n_embd, n_seq_tokens - 1, n_seqs, att_norm->nb[1], att_norm->nb[2], 0),
+                1
+            );
+
+            cur = build_rwkv7_time_mix(gf, att_norm, x_prev, state_copy, state_mask, v_first, ubatch, il);
+
+            token_shift = ggml_view_3d(ctx0, att_norm, n_embd, 1, n_seqs, att_norm->nb[1], att_norm->nb[2], (n_seq_tokens-1)*n_embd*ggml_element_size(att_norm));
+            ggml_build_forward_expand(gf, build_rwkv_token_shift_store(token_shift, ubatch, il));
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+            cb(ffn_inp, "ffn_inp", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur     = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, cur, n_embd, n_tokens), inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ggml_reshape_2d(ctx0, ffn_inp, n_embd, n_tokens), inp_out_ids);
+            }
+
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
         cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
         cb(cur, "result_norm", -1);
@@ -10560,7 +11106,7 @@ struct llm_build_chameleon : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-        auto * inp_attn = build_attn_inp_kv_unified(true, false);
+        auto * inp_attn = build_attn_inp_kv_unified();
 
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
@@ -10872,9 +11418,11 @@ llama_memory_i * llama_model::create_memory() const {
     llama_memory_i * res;
 
     switch (arch) {
+        case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
-        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_RWKV7:
+        case LLM_ARCH_ARWKV7:
             {
                 res = new llama_kv_cache_unified(hparams, {
                     /*.get_rope_factors =*/ nullptr
@@ -11121,6 +11669,14 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_rwkv6qwen2>(*this, params, gf);
             } break;
+        case LLM_ARCH_RWKV7:
+            {
+                llm = std::make_unique<llm_build_rwkv7>(*this, params, gf);
+            } break;
+        case LLM_ARCH_ARWKV7:
+            {
+                llm = std::make_unique<llm_build_arwkv7>(*this, params, gf);
+            } break;
         case LLM_ARCH_CHAMELEON:
             {
                 llm = std::make_unique<llm_build_chameleon>(*this, params, gf);
             } break;
@@ -11234,6 +11790,8 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_JAIS:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_RWKV7:
+        case LLM_ARCH_ARWKV7:
         case LLM_ARCH_WAVTOKENIZER_DEC:
             return LLAMA_ROPE_TYPE_NONE;
 
@@ -11388,6 +11946,8 @@ bool llama_model_is_recurrent(const llama_model * model) {
         case LLM_ARCH_MAMBA:      return true;
         case LLM_ARCH_RWKV6:      return true;
         case LLM_ARCH_RWKV6QWEN2: return true;
+        case LLM_ARCH_RWKV7:      return true;
+        case LLM_ARCH_ARWKV7:     return true;
         default: return false;
     }
 }
diff --git a/src/llama-model.h b/src/llama-model.h
index 55c26a92b..a9da1215a 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -29,6 +29,7 @@ enum llm_type {
     LLM_TYPE_109M,
     LLM_TYPE_137M,
     LLM_TYPE_160M,
+    LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
     LLM_TYPE_270M,
@@ -45,6 +46,7 @@ enum llm_type {
     LLM_TYPE_1_6B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
+    LLM_TYPE_2_9B,
     LLM_TYPE_3B,
     LLM_TYPE_4B,
     LLM_TYPE_6B,
@@ -260,6 +262,20 @@ struct llama_layer {
     struct ggml_tensor * time_mix_receptance_b = nullptr;
     struct ggml_tensor * time_mix_gate = nullptr;
 
+    // rwkv7
+    struct ggml_tensor * time_mix_w0 = nullptr;
+    struct ggml_tensor * time_mix_a0 = nullptr;
+    struct ggml_tensor * time_mix_a1 = nullptr;
+    struct ggml_tensor * time_mix_a2 = nullptr;
+    struct ggml_tensor * time_mix_v0 = nullptr;
+    struct ggml_tensor * time_mix_v1 = nullptr;
+    struct ggml_tensor * time_mix_v2 = nullptr;
+    struct ggml_tensor * time_mix_g1 = nullptr;
+    struct ggml_tensor * time_mix_g2 = nullptr;
+    struct ggml_tensor * time_mix_k_k = nullptr;
+    struct ggml_tensor * time_mix_k_a = nullptr;
+    struct ggml_tensor * time_mix_r_k = nullptr;
+
     struct ggml_tensor * time_mix_ln = nullptr;
     struct ggml_tensor * time_mix_ln_b = nullptr;
     struct ggml_tensor * time_mix_output = nullptr;
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index fb7982655..09eb57077 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -756,10 +756,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
 
-        // do not quantize RWKV's time_mix_first tensors
+        // do not quantize RWKV's small yet 2D weights
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
+        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
         quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index c86ffb64e..adb749bd5 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1916,6 +1916,40 @@ struct test_gla : public test_case {
     }
 };
 
+// GGML_OP_RWKV_WKV7
+struct test_rwkv_wkv7 : public test_case {
+    const ggml_type type;
+
+    const int64_t head_count;
+    const int64_t head_size;
+    const int64_t n_seq_tokens;
+    const int64_t n_seqs;
+
+    std::string vars() override {
+        return VARS_TO_STR5(type, head_count, head_size, n_seq_tokens, n_seqs);
+    }
+
+    test_rwkv_wkv7(ggml_type type = GGML_TYPE_F32,
+            int64_t head_count = 32, int64_t head_size = 64, int64_t n_seq_tokens = 32, int64_t n_seqs = 32)
+        : type(type), head_count(head_count), head_size(head_size), n_seq_tokens(n_seq_tokens), n_seqs(n_seqs) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const int64_t n_tokens = n_seq_tokens * n_seqs;
+        ggml_tensor * r = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * w = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * k = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * v = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 3, std::vector<int64_t>{ head_size, head_count, n_tokens }.data());
+        // Outputs may become NaN with long seqlen without these normalizations
+        a = ggml_l2_norm(ctx, a, 1e-7F);
+        b = ggml_l2_norm(ctx, b, 1e-7F);
+        ggml_tensor * s = ggml_new_tensor(ctx, type, 2, std::vector<int64_t>{ head_size * head_size * head_count, n_seqs }.data());
+        ggml_tensor * out = ggml_rwkv_wkv7(ctx, r, w, k, v, a, b, s);
+        return out;
+    }
+};
+
 // GGML_OP_MUL_MAT
 struct test_mul_mat : public test_case {
     const ggml_type type_a;
@@ -2972,6 +3006,32 @@ struct test_group_norm : public test_case {
     }
 };
 
+// GGML_OP_L2_NORM
+struct test_l2_norm : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const float eps;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_l2_norm(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {64, 64, 320, 1},
+            float eps = 1e-12f)
+        : type(type), ne(ne), eps(eps) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_l2_norm(ctx, a, eps);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+};
+
 // GGML_OP_ACC
 struct test_acc : public test_case {
     const ggml_type type;
@@ -4006,8 +4066,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, v, eps));
         }
         test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
+        test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
     }
 
+    test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));
+
     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 1, 1}, {4, 1536, 1, 1}));
     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {8, 1536, 1, 1}, {4, 1536, 1, 1}));
     test_cases.emplace_back(new test_ssm_conv(GGML_TYPE_F32, {4, 1536, 4, 1}, {4, 1536, 1, 1}));
@@ -4019,6 +4082,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 32, 4));
     test_cases.emplace_back(new test_rwkv_wkv6(GGML_TYPE_F32, 32, 64, 128, 4));
 
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 1, 1));
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 32, 1));
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 32, 4));
+    test_cases.emplace_back(new test_rwkv_wkv7(GGML_TYPE_F32, 32, 64, 128, 4));
+
     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 1, 1));
     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 1));
     test_cases.emplace_back(new test_gla(GGML_TYPE_F32, 32, 64, 32, 4));