slm_ikllama_sys 0.1.1

//
// Copyright (C) 2023-2025 The llama.cpp authors
// Copyright (C) 2024-2025 Iwan Kawrakow
// MIT license
// SPDX-License-Identifier: MIT
//

#include "llama-impl.h"
#include "llama-vocab.h"
#include "llama-grammar.h"
#include "llama-sampling.h"
#include "llama-arch.h"
#include "llama-mmap.h"
#include "llama-model-loader.h"
#include "llama-model.h"
#include "llama-build-context.h"
#include "llama-cparams.h"
#include "llama-hparams.h"
#include "llama-context.h"
#include "llama-spec-features.h"
#include "llama-quantize.h"

#include "unicode.h"

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_context * target_ctx);

// TODO: fix these includes
#include "iqk/iqk_quantize.h"
#include "iqk/iqk_cpu_ops.h"

#define IK_PRINT_TIMING 0

#ifdef GGML_USE_RPC
#  include "ggml-rpc.h"
#endif

#ifdef GGML_USE_CUDA
#  include "ggml-cuda.h"
#elif defined(GGML_USE_VULKAN)
#  include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#  include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
#   include "ggml-kompute.h"
#elif defined(GGML_USE_CANN)
#   include "ggml-cann.h"
#endif

#ifdef GGML_USE_BLAS
#  include "ggml-blas.h"
#endif

#ifdef GGML_USE_METAL
#  include "ggml-metal.h"
#endif

#ifdef __has_include
    #if __has_include(<unistd.h>)
        #include <unistd.h>
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
            #include <fcntl.h>
        #endif
        #if defined(_POSIX_MEMLOCK_RANGE)
            #include <sys/resource.h>
        #endif
    #endif
#endif

#if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #ifndef PATH_MAX
        #define PATH_MAX MAX_PATH
    #endif
    #include <io.h>
#endif

//#if __cplusplus >= 202000L
//    #define LU8(x) (const char*)(u8##x)
//#else
//    #define LU8(x) u8##x
//#endif
#define LU8(x) (const char*)(u8##x)
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <functional>
#include <future>
#include <initializer_list>
#include <locale>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <set>
#include <unordered_set>
#include <sstream>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <regex>
#include <tuple>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

// bump if necessary
#define LLAMA_MAX_LAYERS  512

//
// helpers
//


static bool is_utf8_whitespace(uint8_t c) {
    // Basic ASCII whitespace
    if (c <= 0x7F) return isspace(c);
    // Else: Not whitespace (or you'd need a full Unicode table)
    return false;
}

static std::string trim(const std::string & str) {
    size_t start = 0;
    size_t end = str.size();
    while (start < end && is_utf8_whitespace(str[start])) start++;
    while (end > start && is_utf8_whitespace(str[end - 1])) end--;
    return str.substr(start, end - start);
}

static bool stop_internal_decode = false;

void  llama_decode_reset() {
    stop_internal_decode = false;
}

void  llama_decode_stop() {
    stop_internal_decode = true;
}

static std::vector<std::string> string_split(const std::string& str, const std::string& delimiter) {
    std::vector<std::string> parts;
    size_t start = 0;
    size_t end = str.find(delimiter);
    while (end != std::string::npos) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
        end = str.find(delimiter, start);
    }
    parts.push_back(str.substr(start));
    return parts;
}

// extract ip and port from RPC[ip:port] for rpc and keep other device names
static std::vector<rpc_device>  extract_device_from_rpc_device(std::vector<std::string> devices) {
    std::vector<rpc_device> rpc_servers;
    for (auto & device : devices) {
        rpc_device rpc;
        auto value = string_split(device, "|");
        if (value.size() == 2) {
            rpc.device = std::stoi(value[1]);
            rpc.endpoint = value[0];
        }
        rpc_servers.push_back(rpc);
    }
    return rpc_servers;
}


enum llm_chat_template {
    LLM_CHAT_TEMPLATE_CHATML,
    LLM_CHAT_TEMPLATE_LLAMA_2,
    LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
    LLM_CHAT_TEMPLATE_MISTRAL_V1,
    LLM_CHAT_TEMPLATE_MISTRAL_V3,
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
    LLM_CHAT_TEMPLATE_PHI_3,
    LLM_CHAT_TEMPLATE_FALCON_3,
    LLM_CHAT_TEMPLATE_FALCON_E,
    LLM_CHAT_TEMPLATE_ZEPHYR,
    LLM_CHAT_TEMPLATE_MONARCH,
    LLM_CHAT_TEMPLATE_GEMMA,
    LLM_CHAT_TEMPLATE_ORION,
    LLM_CHAT_TEMPLATE_OPENCHAT,
    LLM_CHAT_TEMPLATE_VICUNA,
    LLM_CHAT_TEMPLATE_VICUNA_ORCA,
    LLM_CHAT_TEMPLATE_DEEPSEEK,
    LLM_CHAT_TEMPLATE_DEEPSEEK_2,
    LLM_CHAT_TEMPLATE_DEEPSEEK_3,
    LLM_CHAT_TEMPLATE_COMMAND_R,
    LLM_CHAT_TEMPLATE_LLAMA_3,
    LLM_CHAT_TEMPLATE_CHATGLM_3,
    LLM_CHAT_TEMPLATE_CHATGLM_4,
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
    LLM_CHAT_TEMPLATE_RWKV_WORLD,
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
    LLM_CHAT_TEMPLATE_MEGREZ,
    LLM_CHAT_TEMPLATE_LLAMA4,
    LLM_CHAT_TEMPLATE_BITNET,
    LLM_CHAT_TEMPLATE_DOTS1,
    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_OPENAI_MOE,
    LLM_CHAT_TEMPLATE_GROK_2,
    LLM_CHAT_TEMPLATE_BAILING,
    LLM_CHAT_TEMPLATE_BAILING_THINK,
    LLM_CHAT_TEMPLATE_BAILING2,
    LLM_CHAT_TEMPLATE_SEED_OSS,
    LLM_CHAT_TEMPLATE_UNKNOWN,
};

static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "chatml",            LLM_CHAT_TEMPLATE_CHATML            },
    { "llama2",            LLM_CHAT_TEMPLATE_LLAMA_2           },
    { "llama2-sys",        LLM_CHAT_TEMPLATE_LLAMA_2_SYS       },
    { "llama2-sys-bos",    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS   },
    { "llama2-sys-strip",  LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
    { "mistral-v1",        LLM_CHAT_TEMPLATE_MISTRAL_V1        },
    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
    { "falcon_e",          LLM_CHAT_TEMPLATE_FALCON_E          },
    { "zephyr",            LLM_CHAT_TEMPLATE_ZEPHYR            },
    { "monarch",           LLM_CHAT_TEMPLATE_MONARCH           },
    { "gemma",             LLM_CHAT_TEMPLATE_GEMMA             },
    { "orion",             LLM_CHAT_TEMPLATE_ORION             },
    { "openchat",          LLM_CHAT_TEMPLATE_OPENCHAT          },
    { "vicuna",            LLM_CHAT_TEMPLATE_VICUNA            },
    { "vicuna-orca",       LLM_CHAT_TEMPLATE_VICUNA_ORCA       },
    { "deepseek",          LLM_CHAT_TEMPLATE_DEEPSEEK          },
    { "deepseek2",         LLM_CHAT_TEMPLATE_DEEPSEEK_2        },
    { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3        },
    { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R         },
    { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3           },
    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGLM_3         },
    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGLM_4         },
    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
    { "rwkv-world",        LLM_CHAT_TEMPLATE_RWKV_WORLD        },
    { "granite",           LLM_CHAT_TEMPLATE_GRANITE           },
    { "gigachat",          LLM_CHAT_TEMPLATE_GIGACHAT          },
    { "megrez",            LLM_CHAT_TEMPLATE_MEGREZ            },
    { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4            },
    { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
    { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
    { "gpt-oss",           LLM_CHAT_TEMPLATE_OPENAI_MOE        },
    { "bitnet",            LLM_CHAT_TEMPLATE_BITNET            },
    { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
    { "bailing",           LLM_CHAT_TEMPLATE_BAILING           },
    { "bailing-think",     LLM_CHAT_TEMPLATE_BAILING_THINK     },
    { "bailing2",          LLM_CHAT_TEMPLATE_BAILING2          },
    { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS          },

};

//
// gguf helpers
//

static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
    switch (type) {
        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
        default:                return format("unknown type %d", type);
    }
}

std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i) {
    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);

    switch (type) {
        case GGUF_TYPE_STRING:
            return gguf_get_val_str(ctx_gguf, i);
        case GGUF_TYPE_ARRAY:
            {
                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
                int arr_n = gguf_get_arr_n(ctx_gguf, i);
                const void * data = gguf_get_arr_data(ctx_gguf, i);
                std::stringstream ss;
                ss << "[";
                for (int j = 0; j < arr_n; j++) {
                    if (arr_type == GGUF_TYPE_STRING) {
                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
                        // escape quotes
                        replace_all(val, "\\", "\\\\");
                        replace_all(val, "\"", "\\\"");
                        ss << '"' << val << '"';
                    } else if (arr_type == GGUF_TYPE_ARRAY) {
                        ss << "???";
                    } else {
                        ss << gguf_data_to_str(arr_type, data, j);
                    }
                    if (j < arr_n - 1) {
                        ss << ", ";
                    }
                }
                ss << "]";
                return ss.str();
            }
        default:
            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
    }
}

//
// llama helpers
//

ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
    ggml_backend_buffer_type_t buft = nullptr;

#if defined(GGML_USE_CUDA)
    // host buffers should only be used when data is expected to be copied to/from the GPU
    if (host_buffer) {
        buft = ggml_backend_cuda_host_buffer_type();
    }
#elif defined(GGML_USE_SYCL)
    if (host_buffer) {
        buft = ggml_backend_sycl_host_buffer_type();
    }
#elif defined(GGML_USE_CPU_HBM)
    buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
    if (host_buffer) {
        buft = ggml_backend_vk_host_buffer_type();
    }
#endif

    if (buft == nullptr) {
        buft = ggml_backend_cpu_buffer_type();
    }
    return buft;

    GGML_UNUSED(host_buffer);
}

//
// globals
//

struct llama_state {
    llama_state() {
#ifdef GGML_USE_METAL
        ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
#elif defined(GGML_USE_CUDA)
        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
#elif defined(GGML_USE_CANN)
        ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
#endif
    }

    // We save the log callback globally
    ggml_log_callback log_callback = llama_log_callback_default;
    void * log_callback_user_data = nullptr;
};

static llama_state g_state;

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

static const char * llama_expert_gating_func_name(llm_expert_gating_func_type type) {
    switch (type) {
        case LLM_EXPERT_GATING_FUNC_SOFTMAX: return "softmax";
        case LLM_EXPERT_GATING_FUNC_SIGMOID: return "sigmoid";
        case LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT: return "softmax_weight";
        default:                             return "unknown";
    }
}

llama_model::~llama_model() {
    for (struct ggml_context * ctx : ctxs) {
        ggml_free(ctx);
    }
    for (ggml_backend_buffer_t buf : bufs) {
#ifdef GGML_USE_CUDA
        if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
            ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
        }
#endif
        ggml_backend_buffer_free(buf);
    }
    while (!lora_adapters.empty()) {
        llama_lora_adapter_free(*lora_adapters.begin());
    }
}

static size_t llama_get_device_count(const llama_model & model) {
    size_t count = 1;
#if defined(GGML_USE_CUDA)
    count = ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
    count = ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
    count = ggml_backend_vk_get_device_count();
#elif defined(GGML_USE_CANN)
    return ggml_backend_cann_get_device_count();
#endif
#if defined(GGML_USE_RPC)
    count += model.rpc_servers.size();
#endif
    return count;
    GGML_UNUSED(model);
}

static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
    ggml_backend_buffer_type_t buft = nullptr;

#if defined(GGML_USE_RPC)
    int dev_count = (int)llama_get_device_count(model);
    int rpc_count = (int)model.rpc_servers.size();
    if (gpu >= dev_count - rpc_count) {
        int rpc_idx = gpu - dev_count + rpc_count;
        rpc_device rpc = model.rpc_servers[rpc_idx];
        const char * endpoint = rpc.endpoint.c_str();
        return ggml_backend_rpc_buffer_type(endpoint, rpc.device);
    }
#endif
#if defined(GGML_USE_METAL)
    buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
    buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
    buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
    buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_KOMPUTE)
    buft = ggml_backend_kompute_buffer_type(gpu);
    if (buft == nullptr) {
        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
    }
#elif defined(GGML_USE_CANN)
    buft = ggml_backend_cann_buffer_type(gpu);
#endif

    if (buft == nullptr) {
        buft = llama_default_buffer_type_cpu(true);
    }
    return buft;
    GGML_UNUSED(model);
    GGML_UNUSED(gpu);
}

static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu) {
    ggml_backend_buffer_type_t buft = nullptr;

#ifdef GGML_USE_CUDA
    if (ggml_backend_cuda_get_device_count() > 1) {
        buft = ggml_backend_cuda_split_buffer_type(model.splits.data());
    }
#endif

#ifdef GGML_USE_SYCL
    if (ggml_backend_sycl_get_device_count() > 1) {
        buft = ggml_backend_sycl_split_buffer_type(model.splits.data());
    }
#endif

    if (buft == nullptr) {
        buft = llama_default_buffer_type_offload(model, fallback_gpu);
    }
    return buft;

}

int llama_model::device_count() const {
    return llama_get_device_count(*this);
}

ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const {
    return llama_default_buffer_type_offload(*this, device);
}

static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
    int dev_count = (int)llama_get_device_count(model);
    int rpc_count = (int)model.rpc_servers.size();
    if (device >= dev_count - rpc_count) {
        size_t total;
        size_t free;
        rpc_device rpc = model.rpc_servers[device - dev_count + rpc_count];
        const char * endpoint = rpc.endpoint.c_str();
        ggml_backend_rpc_get_device_memory(endpoint, rpc.device, &free, &total);
        return free;
    }
#endif
#if defined(GGML_USE_CUDA)
    size_t total;
    size_t free;
    ggml_backend_cuda_get_device_memory(device, &free, &total);
    return free;
#elif defined(GGML_USE_SYCL)
    size_t total;
    size_t free;
    ggml_backend_sycl_get_device_memory(device, &free, &total);
    return free;
#elif defined(GGML_USE_VULKAN)
    size_t total;
    size_t free;
    ggml_backend_vk_get_device_memory(device, &free, &total);
    return free;
#elif defined(GGML_USE_CANN)
    size_t total;
    size_t free;
    ggml_backend_cann_get_device_memory(device, &free, &total);
    return free;
#else
    return 1;
#endif
    GGML_UNUSED(model);
    GGML_UNUSED(device);
}

struct llama_context::Prev {
    int all_seq_id;
    int n_outputs;
    int n_kv;
    int n_tokens;
    int save_per_step_ssm;
    int per_step_max_allocated;
    llama_mtp_op_type mtp_op_type;
    ggml_cgraph * graph;
};

void llama_context::reset_scheduler() {
    ggml_backend_sched_reset(sched);
    prev.reset();
    prev_mtp.reset();
}

bool llama_context::can_reuse_graph(const llama_batch & u_batch) {
    if (!cparams.graph_reuse) return false;
    //if (kv_self.save_per_step_ssm) return false;
    if ((model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && mtp_target_ctx != nullptr) return false;
    auto the_prev = cparams.mtp_op_type == MTP_OP_NONE ? prev.get() : prev_mtp.get();
    if (!the_prev || !the_prev->graph) return false;
    //if (u_batch.n_tokens > 1) return false;
    if (u_batch.embd) return false;
    if (the_prev->save_per_step_ssm != kv_self.save_per_step_ssm ||
        the_prev->per_step_max_allocated != kv_self.ckpt.per_step_max_allocated) return false;
    return u_batch.all_seq_id == the_prev->all_seq_id &&
           kv_self.head > 0 &&
           kv_self.n == the_prev->n_kv &&
           n_outputs == the_prev->n_outputs &&
           u_batch.n_tokens == the_prev->n_tokens &&
           cparams.mtp_op_type == the_prev->mtp_op_type &&
           update_cache_copies();
}

/*
static void why_not_reuse_previous(const llama_batch & u_batch, const llama_context & ctx, const llama_context::Prev * the_prev) {
    if (!the_prev) { printf("    previous is null\n"); return; }
    if (!the_prev->graph) { printf("    previous graph is null\n"); return; }
    if (!ctx.cparams.graph_reuse) { printf("    graph_reuse is false\n"); return; }
    if (u_batch.embd) { printf("    ubatch.embd is not null\n"); return; }
    if (u_batch.all_seq_id != the_prev->all_seq_id) { printf("    all_seq_id is not the same\n"); return; }
    if (ctx.kv_self.head == 0) { printf("    kv_self.head = 0\n"); return; }
    if (ctx.kv_self.n != the_prev->n_kv) { printf("    kv_self.n is not the same\n"); return; }
    if (ctx.n_outputs != the_prev->n_outputs) { printf("    n_outputs is not the same\n"); return; }
    if (u_batch.n_tokens != the_prev->n_tokens) { printf("    n_tokens is not the same\n"); return; }
    if (ctx.cparams.mtp_op_type != the_prev->mtp_op_type) { printf("    mtp_op_type is not the same\n"); return; }
    printf("    update_cache_copies() must have failed\n");
}
*/

bool llama_context::update_cache_copies() {
    const int n_layer = model.mtp && cparams.mtp_op_type != MTP_OP_NONE ?
        model.hparams.n_layer : model.hparams.n_layer - model.hparams.nextn_predict_layers; //cache_copies.size()/2;
    auto layer_has_attention_kv = [&](int il) {
        return !model.hparams.is_recurrent(il);
    };

    if ((int)kv_self.k_l.size() < n_layer) {
        return false;
    }
    if (!kv_self.v_l.empty() && (int)kv_self.v_l.size() < n_layer) {
        return false;
    }
    for (int il = 0; il < n_layer; ++il) {
        if (!layer_has_attention_kv(il) || kv_self.k_l[il] == nullptr) {
            continue;
        }
        auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
        if (kl) {
            GGML_ASSERT(model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN);
            GGML_ASSERT(model.splits.size() > 1);
            auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr;
            GGML_ASSERT(kl && (kv_self.v_l.empty() || !kv_self.v_l[il] || vl));
            if (vl) {
                GGML_ASSERT(kl->n_device == vl->n_device);
            }
            for (int id = 0; id < kl->n_device; ++id) {
                if (!kl->splits[id]) continue;
                size_t idx = 2*model.splits.size()*il + 2*id + 0;
                auto& c = cache_copies[idx];
                if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) {
                    return false;
                }
                c.cpy->view_offs = kv_self.head*c.step;
                c.cpy->src[1]->data = (char *)kl->splits[id]->data + c.cpy->view_offs;
                c.cpy->data = c.cpy->src[1]->data;
            }
            if (!vl) continue;
            for (int id = 0; id < vl->n_device; ++id) {
                if (!vl->splits[id]) continue;
                auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1];
                if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) {
                    return false;
                }
                c.cpy->view_offs = kv_self.head*c.step;
                c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs;
                c.cpy->data = c.cpy->src[1]->data;
            }
        } else {
            auto& c = cache_copies[2*il+0];
            if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) {
                return false;
            }
            c.cpy->view_offs = kv_self.head*c.step;
            c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
            c.cpy->data = c.cpy->src[1]->data;
            if (!kv_self.v_l.empty() && kv_self.v_l[il]) {
                auto& c = cache_copies[2*il+1];
                if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) {
                    return false;
                }
                c.cpy->view_offs = kv_self.head*c.step;
                c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
                c.cpy->data = c.cpy->src[1]->data;
            }
        }
    }
    return true;
}

llama_context::llama_context(const llama_model & model)
    : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
    const auto & hparams = model.hparams;
    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
        cache_copies.resize(2*model.splits.size()*hparams.n_layer);
    } else {
        cache_copies.resize(2*hparams.n_layer);
    }
}

void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
    LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);

    cparams.mtp_op_type = value;
}

llama_context::~llama_context() {
    ggml_backend_sched_free(sched);

    for (ggml_backend_t backend : backends) {
        ggml_backend_free(backend);
    }

    ggml_backend_buffer_free(buf_output);
}

int llama_context::max_nodes(int n_tokens, int n_kv) const {
    int max_nodes = model.max_nodes(n_tokens);
    if (model.is_mla_model() &&
        cparams.mla_attn > 1 &&
        n_tokens >= 128 &&
        cparams.attn_max_batch > 0 &&
        model.split_mode != LLAMA_SPLIT_MODE_GRAPH &&
        model.layers[0].wkv_b) {
        // In this case we perform the attention computation iteratively, and this adds
        // 10 nodes per layer per iteration. Although in many cases the 65536 nodes we
        // estimate by default are enough to accomodate, to be safe we add the additional
        // number of nodes required for the iterative MLA evaluation.
        int n_head = model.hparams.n_head();
        auto wkv_b = model.layers[0].wkv_b;
        auto kv_f32_size = wkv_b->ne[1] * n_kv * sizeof(float) / (1024*1024);
        if (cparams.attn_max_batch < kv_f32_size) {
            int n_max_head = 1;
            for (int niter = 2; niter < n_head; ++niter) {
                if (n_head % niter == 0 && kv_f32_size/niter <= cparams.attn_max_batch) {
                    n_max_head = n_head/niter;
                    break;
                }
            }
            GGML_ASSERT(n_head % n_max_head == 0);
            int n_iter = n_head / n_max_head;
            max_nodes += (n_iter - 1) * 10 * model.hparams.n_layer;
        }
    }
    return max_nodes;
}

//
// kv cache helpers
//

static inline bool llama_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) {
    return hparams.is_recurrent(il);
}

static inline uint32_t llama_kv_v_row_embd(
        const llama_model &   model,
        const llama_hparams & hparams,
        uint32_t              il) {
    // qwen3next recurrent state is stored in a dedicated V-cache tail (per sequence),
    // so per-token V rows include only attention values.
    if (llm_arch_is_hybrid(model.arch)) {
        return hparams.n_embd_v_gqa(il);
    }

    return hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
}

static inline uint32_t llama_qwen3next_state_slots(const llama_cparams & cparams, uint32_t kv_size) {
    return std::min<uint32_t>(std::max<uint32_t>(1, cparams.n_seq_max), kv_size);
}

static inline uint32_t llama_kv_qnext_state_slots(const llama_kv_cache & cache) {
    uint32_t n_slots = 0;

    for (const ggml_tensor * t : cache.s_l) {
        if (t == nullptr) {
            continue;
        }

        const uint32_t layer_slots = (uint32_t) t->ne[1];
        if (n_slots == 0) {
            n_slots = layer_slots;
        } else {
            GGML_ASSERT(n_slots == layer_slots);
        }
    }

    return n_slots;
}

static inline bool llama_kv_has_qnext_state_storage(const llama_kv_cache & cache) {
    return llama_kv_qnext_state_slots(cache) > 0;
}

static inline bool llama_kv_qnext_seq_id_in_range(const llama_kv_cache & cache, llama_seq_id seq_id) {
    const uint32_t n_slots = llama_kv_qnext_state_slots(cache);
    return n_slots > 0 && seq_id >= 0 && (uint32_t) seq_id < n_slots;
}

static bool llama_kv_cache_init(
             struct llama_kv_cache & cache,
               const llama_context * ctx,
                         ggml_type   type_k,
                         ggml_type   type_v,
                          uint32_t   kv_size,
                              bool   offload,
                          ggml_type  type_k_first,
                          ggml_type  type_k_last,
                          ggml_type  type_v_first,
                          ggml_type  type_v_last,
                          int32_t    n_k_first,
                          int32_t    n_k_last,
                          int32_t    n_v_first,
                          int32_t    n_v_last) {
    const llama_model & model = ctx->model;
    const llama_cparams & cparams = ctx->cparams;

    const struct llama_hparams & hparams = model.hparams;

    const int64_t  n_layer = model.mtp ? hparams.n_layer
                                       : hparams.n_layer - hparams.nextn_predict_layers;

    cache.has_shift = false;

    // TODO: find a nicer way to add other recurrent model architectures
    cache.recurrent = llm_arch_is_recurrent(model.arch);
    cache.hybrid = llm_arch_is_hybrid(model.arch);
    // qwen3next uses hybrid recurrent+attention cache semantics. Keep V rows in
    // standard layout to match the mainline hybrid path when flash attention is off.
    cache.v_trans   = !cache.recurrent && !cparams.flash_attn && !llm_arch_is_hybrid(model.arch);

    cache.head = 0;
    cache.size = kv_size;
    cache.used = 0;

    cache.type_k  = type_k;
    cache.type_v  = type_v;

    cache.cells.clear();
    cache.cells.resize(kv_size);

    if (cache.recurrent || llm_arch_is_hybrid(model.arch)) {
        // init state copy sources
        for (uint32_t i = 0; i < cache.size; ++i) {
            cache.cells[i].src = i;
        }
    }

    bool is_mla_attn = model.is_mla_model();

    bool split_cache   = false;
    bool replicate_mla = false;
    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && !is_mla_attn && offload) {
        cache.split_k_l.reserve(n_layer);
        cache.split_v_l.reserve(n_layer);
        if (llama_model_has_recurrent(&model)) {
            cache.split_s_l.reserve(n_layer);
        }
        split_cache = true;
    }
    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && is_mla_attn && offload) {
        cache.replicated_k_l.reserve(n_layer);
        replicate_mla = true;
    }

    // count used buffer types
    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
    if (offload) {
        const bool is_mtp = (model.arch == LLM_ARCH_GLM_DSA ||
                             model.arch == LLM_ARCH_QWEN35 ||
                             model.arch == LLM_ARCH_QWEN35MOE) && hparams.nextn_predict_layers > 0;
        const int64_t n_mtp_first = hparams.n_layer - hparams.nextn_predict_layers;
        for (int64_t i = 0; i < n_layer; ++i) {
            const bool is_mtp_tail = is_mtp && i >= n_mtp_first;
            if ((split_cache || replicate_mla) && !is_mtp_tail) {
                buft_layer_count[model.buft_layer[i].buft_matrix]++;
                if (model.buft_layer[i].buft != model.buft_layer[i].buft_matrix) {
                    buft_layer_count[model.buft_layer[i].buft]++;
                }
            } else {
                buft_layer_count[model.buft_layer[i].buft]++;
            }
        }
    } else {
        buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
    }

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    for (auto & it : buft_layer_count) {
        int n_layers = it.second;
        size_t ctx_mem_size = 8u*n_layers*ggml_tensor_overhead();
        if (split_cache || replicate_mla) ctx_mem_size += 4*model.splits.size()*n_layers*ggml_tensor_overhead();
        struct ggml_init_params params = {
            /*.mem_size   =*/ ctx_mem_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(params);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
            return false;
        }
        ctx_map[it.first] = ctx;
        cache.ctxs.push_back(ctx);
    }

    if (is_mla_attn) {
        int n_have_wkv_b = 0;
        for (auto& l : model.layers) {
            if (l.wkv_b || l.wk_b_pp) {
                ++n_have_wkv_b;
            }
        }
        bool have_wkv_b = n_have_wkv_b > 0;
        if (!have_wkv_b) {
            if (cparams.mla_attn != 1) {
                LLAMA_LOG_WARN("=========================================================\n");
                LLAMA_LOG_WARN("%s: missing wkv_b tensor(s)\n", __func__);
                LLAMA_LOG_WARN("%s: changing MLA from %d to 1\n", __func__, cparams.mla_attn);
                if (cparams.mla_attn > 1) {
                    LLAMA_LOG_WARN("%s: ** Prompt processing performance will be crippled **\n", __func__);
                }
                LLAMA_LOG_WARN("=========================================================\n");
                // Sorry for the hack.
                auto& non_cparams = const_cast<llama_cparams&>(cparams);
                non_cparams.mla_attn = 1;
            }
        }
    }

    bool needs_v_cache = true;
    cache.k_l.reserve(n_layer);
    if (is_mla_attn && cparams.mla_attn) {
        needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
    }
    if (needs_v_cache) cache.v_l.reserve(n_layer);
    cache.s_l.resize(n_layer, nullptr);

    std::vector<size_t> mem_split(model.splits.size(), 0);

    const uint32_t qnext_state_slots = llama_qwen3next_state_slots(cparams, kv_size);
    if (llm_arch_is_hybrid(model.arch) && qnext_state_slots < std::max<uint32_t>(1, cparams.n_seq_max)) {
        LLAMA_LOG_WARN("%s: reducing qwen3next state slots from %u to %u to fit KV cache size\n",
                __func__, std::max<uint32_t>(1, cparams.n_seq_max), qnext_state_slots);
    }

    int n_mla = 0;
    int n_kv_active_layers = 0;
    const int64_t n_mtp_first_layer = hparams.n_layer - hparams.nextn_predict_layers;
    for (int i = 0; i < (int) n_layer; i++) {
        // For MTP-only context, skip KV allocation for non-MTP layers
        if (cparams.mtp_op_type != MTP_OP_NONE && i < (int)n_mtp_first_layer) {
            cache.k_l.push_back(nullptr);
            if (!is_mla_attn || !cparams.mla_attn || (cparams.mla_attn == 1 && !cparams.flash_attn)) {
                cache.v_l.push_back(nullptr);
            }
            continue;
        }
        n_kv_active_layers++;
        const bool qnext_recurrent = llama_is_recurrent_layer(hparams, i);
        const uint32_t n_embd_v_row = llama_kv_v_row_embd(model, hparams, i);
        const uint32_t n_head_kv    = hparams.n_head_kv(i);
        const uint32_t n_embd_head_k= hparams.n_embd_head_k(i);

        const bool is_mtp_tail_layer = (model.arch == LLM_ARCH_QWEN35 ||
                                        model.arch == LLM_ARCH_QWEN35MOE ||
                                        model.arch == LLM_ARCH_GLM_DSA) &&
                hparams.nextn_predict_layers > 0 && i >= (int)n_mtp_first_layer;
        //struct ggml_context * ctx = split_cache && !qnext_recurrent ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
        struct ggml_context * ctx = ((split_cache || replicate_mla) && !is_mtp_tail_layer) ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
        ggml_tensor * k = nullptr;
        ggml_tensor * v = nullptr;
        ggml_tensor * s = nullptr;
        if (is_mla_attn && cparams.mla_attn) {
            // DeepSeek MLA
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
            const uint32_t kv_lora_rank = hparams.n_lora_kv;
            //LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank);
            ggml_type primary_kv_type = cparams.flash_attn ? cache.type_k
                                       : (cparams.mla_attn == 1 ? cache.type_k : cache.type_v);
            ggml_tensor * kv = ggml_new_tensor_2d(ctx, primary_kv_type, kv_lora_rank + n_embd_head_qk_rope, kv_size);
            ggml_format_name(kv, "cache_k_l%d", i);
            cache.k_l.push_back(kv);
            if (!cparams.flash_attn && cparams.mla_attn == 1) {
                ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_v, kv_lora_rank*kv_size);
                ggml_format_name(kvt, "cache_v_l%d", i);
                cache.v_l.push_back(kvt);
            }
            // Per-device replicas of the compressed latent KV cache (n_device from wo's split).
            if (replicate_mla && !is_mtp_tail_layer && model.layers[i].wo && model.layers[i].wo->extra) {
                auto wo = model.layers[i].wo;
                auto extra_wo = (const ggml_split_tensor_t *)wo->extra;
                int n_device = extra_wo->n_device;
                auto & repl_k_l = cache.replicated_k_l.emplace_back();
                repl_k_l.tensor_splits.resize(n_device, nullptr);
                for (int is = 0; is < n_device; ++is) {
                    if (!extra_wo->splits[is]) continue;
                    ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type,
                            kv_lora_rank + n_embd_head_qk_rope, kv_size);
                    auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is);
                    ggml_set_name(rkv, split_name.c_str());
                    repl_k_l.tensor_splits[is] = rkv;
                    mem_split[is] += ggml_nbytes(rkv);
                }
                repl_k_l.ggml.n_device  = n_device;
                repl_k_l.ggml.split_dim = -1;
                repl_k_l.ggml.splits    = repl_k_l.tensor_splits.data();
                kv->extra = (void *)&repl_k_l.ggml;
            }
            n_mla++;
        }
        else {
            // Qwen 3 MTP layer always need KV cache in the MTP context
            const bool is_mtp_layer = (cparams.mtp_op_type != MTP_OP_NONE && i >= (int)n_mtp_first_layer);
            if (!hparams.has_kv(i) && !is_mtp_layer) {
                cache.k_l.push_back(nullptr);
                cache.v_l.push_back(nullptr);
                continue;
            }
            if (qnext_recurrent) {
                s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.n_embd_v_s(), qnext_state_slots);
                auto s_name = std::string{"cache_s_l"} + std::to_string(i);
                ggml_set_name(s, s_name.c_str());
                cache.s_l[i] = s;
                cache.k_l.push_back(nullptr);
                cache.v_l.push_back(nullptr);
                LLAMA_LOG_DEBUG("=== Created recurrent cache %s as %ld x %ld x %ld x %ld\n", s->name, s->ne[0], s->ne[1], s->ne[2], s->ne[3]);
                if (split_cache && model.layers[i].ssm_out->extra) {
                    auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[i].ssm_out->extra;
                    GGML_ASSERT(split_ssm_out);
                    int num_v_heads = hparams.ssm_dt_rank;
                    int head_v_dim  = hparams.ssm_d_inner / num_v_heads;
                    int n_device = split_ssm_out->n_device;
                    auto & split_s_l = cache.split_s_l.emplace_back();
                    split_s_l.tensor_splits.resize(n_device, nullptr);
                    for (int is = 0; is < n_device; ++is) {
                        auto split = split_ssm_out->splits[is];
                        if (!split) continue;
                        GGML_ASSERT(split->ne[0] % head_v_dim == 0);
                        int nv = split->ne[0] / head_v_dim;
                        auto size = hparams.n_embd_v_s_id(nv);
                        split_s_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, size, qnext_state_slots);
                        auto split_name = s_name + '.' + std::to_string(is);
                        ggml_set_name(split_s_l.tensor_splits[is], split_name.c_str());
                        mem_split[is] += ggml_nbytes(split_s_l.tensor_splits[is]);
                    }
                    split_s_l.ggml.n_device  = n_device;
                    split_s_l.ggml.split_dim = 0;
                    split_s_l.ggml.splits    = split_s_l.tensor_splits.data();
                    cache.s_l[i]->extra = (void *)&split_s_l.ggml;
                }
                continue;
            }
            bool split_cache_i = split_cache;
            auto K = model.layers[i].wk;
            auto V = model.layers[i].wv;
            if (!V && model.arch == LLM_ARCH_GEMMA4) {
                V = K;
            }
            if (split_cache && (!K || !V || !K->extra || !V->extra)) {
                ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
                split_cache_i = false;
            }
            int n_embd_head_v = hparams.n_embd_head_v(i);
            auto this_type_k = type_k;
            if (type_k_first != type_k && n_k_first > 0 && i < n_k_first) {
                this_type_k = type_k_first;
            }
            if (type_k_last != type_k && n_k_last > 0 && i >= n_layer - n_k_last) {
                this_type_k = type_k_last;
            }
            if (this_type_k != type_k) {
                LLAMA_LOG_INFO("================= Setting K-cache type in layer %2d to %s\n", i, ggml_type_name(this_type_k));
            }
            k = ggml_new_tensor_2d(ctx, this_type_k, n_embd_head_k, n_head_kv*kv_size);

            int64_t v_ne = int64_t(n_embd_v_row)*kv_size;
            auto this_type_v = type_v;
            if (type_v_first != type_v && n_v_first > 0 && i < n_v_first) {
                this_type_v = type_v_first;
            }
            if (type_v_last != type_v && n_v_last > 0 && i >= n_layer - n_v_last) {
                this_type_v = type_v_last;
            }
            if (this_type_v != type_v) {
                LLAMA_LOG_INFO("================= Setting V-cache type in layer %2d to %s\n", i, ggml_type_name(this_type_v));
            }
            v = ggml_new_tensor_1d(ctx, this_type_v, v_ne);

            auto k_name = std::string{"cache_k_l"} + std::to_string(i);
            auto v_name = std::string{"cache_v_l"} + std::to_string(i);
            ggml_set_name(k, k_name.c_str());
            ggml_set_name(v, v_name.c_str());

            if (split_cache_i) {
                bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
                auto extra_K = (const ggml_split_tensor_t *)K->extra;
                auto extra_V = (const ggml_split_tensor_t *)V->extra;
                auto & split_k_l = cache.split_k_l.emplace_back();
                auto & split_v_l = cache.split_v_l.emplace_back();
                split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
                split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
                for (int is = 0; is < extra_K->n_device; ++is) {
                    auto split = use_V_for_K ? extra_V->splits[is] : extra_K->splits[is];
                    if (!split) continue;
                    int nhead_kv = use_V_for_K ? split->ne[1] / n_embd_head_v : split->ne[1]/n_embd_head_k;
                    if (use_V_for_K) {
                        LLAMA_LOG_DEBUG("K_cache(%d, %d): using %d instead of %ld heads\n",
                                i, is, nhead_kv, extra_K->splits[is]->ne[1]/n_embd_head_k);
                    }
                    split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, this_type_k, n_embd_head_k, nhead_kv * kv_size);
                    auto split_name = k_name + '.' + std::to_string(is);
                    ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
                    mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
                }
                split_k_l.ggml.n_device  = extra_K->n_device;
                split_k_l.ggml.split_dim = 0;
                split_k_l.ggml.splits    = split_k_l.tensor_splits.data();
                for (int is = 0; is < extra_V->n_device; ++is) {
                    auto split = extra_V->splits[is];
                    if (!split) continue;
                    split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, this_type_v, split->ne[1] * kv_size);
                    auto split_name = v_name + '.' + std::to_string(is);
                    ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
                    mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
                }
                split_v_l.ggml.n_device  = extra_V->n_device;
                split_v_l.ggml.split_dim = 0;
                split_v_l.ggml.splits    = split_v_l.tensor_splits.data();
                k->extra = (void *)&split_k_l.ggml;
                v->extra = (void *)&split_v_l.ggml;
            }
            cache.k_l.push_back(k);
            cache.v_l.push_back(v);
        }
    }
    if (is_mla_attn && cparams.mla_attn && n_mla < n_kv_active_layers && n_mla > 0) {
        LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d active KV layers having MLA enabled\n", __func__, n_mla, n_kv_active_layers);
        LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
        GGML_ABORT("fatal error");
    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        int ntensor = 0;
        for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            ++ntensor;
        }
        if (ntensor > 0) {
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
            if (!buf) {
                LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
                return false;
            }
            ggml_backend_buffer_clear(buf, 0);
            LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
            cache.bufs.push_back(buf);
        }
    }
    if (split_cache || replicate_mla) {
        LLAMA_LOG_INFO("%s: KV cache size per device%s:\n", __func__,
                       replicate_mla ? " (MLA replicated)" : "");
        for (int i = 0; i < int(mem_split.size()); ++i) LLAMA_LOG_INFO("    Device %d:  %g MiB\n", i, mem_split[i]/1024./1024.);
    }

#if 0
    for (int il = 0; il < n_layer; ++il) {
        if (cache.k_l[il]->extra) {
            printf("Layer %2d, K-buffer: %p:", il, (void *)cache.k_l[il]->buffer);
            auto split_kl = (ggml_split_tensor_t *)cache.k_l[il]->extra;
            for (int id = 0; id < split_kl->n_device; ++id) {
                if (split_kl->splits[id]) printf(" %p,%p", (void *)split_kl->splits[id]->data, (void *)split_kl->splits[id]->buffer);
            }
            printf("\n");
        }
        if (cache.v_l[il]->extra) {
            printf("Layer %2d, V-buffer: %p:", il, (void *)cache.v_l[il]->buffer);
            auto split_vl = (ggml_split_tensor_t *)cache.v_l[il]->extra;
            for (int id = 0; id < split_vl->n_device; ++id) {
                if (split_vl->splits[id]) printf(" %p,%p", (void *)split_vl->splits[id]->data, (void *)split_vl->splits[id]->buffer);
            }
            printf("\n");
        }
    }
#endif

    return true;
}

// find an empty slot of size "n_tokens" in the cache
// updates the cache head
// Note: On success, it's important that cache.head points
// to the first cell of the slot.
static bool llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
        const struct llama_batch & batch) {
    const uint32_t n_tokens = batch.n_tokens;

    if (cache.recurrent) {
        // For recurrent state architectures (like Mamba),
        // each KV cache cell can store the state for a whole sequence.

        llama_seq_id min = cache.size - 1;
        llama_seq_id max = 0;

        for (uint32_t i = 0; i < n_tokens; ++i) {
            for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) {
                llama_seq_id seq_id = batch.seq_id[i][j];
                // make sure it's a valid seq_id
                if ((uint32_t) seq_id < cache.size) {
                    if (seq_id > max) {
                        max = seq_id;
                    }
                    if (seq_id < min) {
                        min = seq_id;
                    }
                    // Assuming the tokens are in-order
                    if (batch.pos[i] != cache.cells[seq_id].pos + 1) {
                        // What should happen when the pos backtracks or skips a value?
                        // Clearing the state mid-batch would require special-casing which isn't done.
                        LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
                            __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id);
                    }
                    if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) {
                        cache.used += 1;
                    }
                    cache.cells[seq_id].pos = batch.pos[i];
                    // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set
                } else {
                    // too big seq_id
                    // TODO: would it be possible to resize the KV cache size instead?
                    LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
                    return false;
                }
            }
        }

        // allow getting the range of used cells, from head to head + n
        cache.head = min;
        cache.n    = max - min + 1;

        // sanity check
        return max >= min;
    }
    // otherwise, one cell per token.

    if (n_tokens > cache.size) {
        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
        return false;
    }

    uint32_t n_tested = 0;

    while (true) {
        if (cache.head + n_tokens > cache.size) {
            n_tested += cache.size - cache.head;
            cache.head = 0;
            continue;
        }

        bool found = true;
        for (uint32_t i = 0; i < n_tokens; i++) {
            if (cache.cells[cache.head + i].pos >= 0) {
                found = false;
                cache.head += i + 1;
                n_tested   += i + 1;
                break;
            }
        }

        if (found) {
            break;
        }

        if (n_tested >= cache.size) {
            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
            return false;
        }
    }

    for (uint32_t i = 0; i < n_tokens; i++) {
        cache.cells[cache.head + i].pos = batch.pos[i];

        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
        }
    }

    cache.used += n_tokens;

    return true;
}

// find how many cells are currently in use
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
    for (uint32_t i = cache.size; i > 0; --i) {
        const llama_kv_cell & cell = cache.cells[i - 1];

        if (cell.pos >= 0 && !cell.is_empty()) {
            return i;
        }
    }

    return 0;
}

static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache, uint32_t pad) {
    uint32_t last = cache.size;
    for (uint32_t i = cache.size; i > 0; i -= pad) {
        const llama_kv_cell & cell = cache.cells[i - 1];

        if (cell.pos >= 0 && !cell.is_empty()) {
            for (uint32_t j = last; j > i; --j) {
                auto & cell_j = cache.cells[j - 1];
                if (cell_j.pos >= 0 && !cell_j.is_empty()) return j;
            }
            return i;
        }
        last = i;
    }

    return 0;
}

bool llama_kv_cache::checkpoint_supported() const {
    for (const auto * s : s_l) {
        if (s != nullptr) {
            return true;
        }
    }
    return false;
}

bool llama_kv_cache::checkpoint_alloc_shadows(bool conv_only_shadow) {
    if (ckpt.allocated) {
        if (ckpt.shadow_conv_only != conv_only_shadow) {
            LLAMA_LOG_ERROR("%s: requested %s shadow buffers, but %s shadow buffers are already allocated\n",
                    __func__,
                    conv_only_shadow ? "conv-state-only" : "full-state",
                    ckpt.shadow_conv_only ? "conv-state-only" : "full-state");
            return false;
        }
        return true;
    }

    const uint32_t n_layer = (uint32_t)s_l.size();
    ckpt.s_l_shadow.resize(n_layer, nullptr);

    struct tensor_entry {
        ggml_tensor * primary;
        uint32_t      il;
        int           split_idx; // -1 for non-split
    };

    std::map<ggml_backend_buffer_type_t, std::vector<tensor_entry>> buft_entries;

    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr) {
            continue;
        }

        auto * extra = s_l[il]->extra;
        if (extra != nullptr) {
            auto * split_info = (const ggml_split_tensor_t *)extra;
            for (int d = 0; d < split_info->n_device; ++d) {
                if (split_info->splits[d] == nullptr) continue;
                ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(split_info->splits[d]->buffer);
                buft_entries[buft].push_back({split_info->splits[d], il, d});
            }
        } else {
            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(s_l[il]->buffer);
            buft_entries[buft].push_back({s_l[il], il, -1});
        }
    }

    // Allocate all shadows on the same backend type as the source tensor.
    for (auto & [buft, entries] : buft_entries) {
        ggml_init_params params = {
            /*.mem_size   =*/ entries.size() * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(params);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to create ggml context for shadow tensors\n", __func__);
            return false;
        }

        for (auto & entry : entries) {
            ggml_tensor * shadow = nullptr;
            if (conv_only_shadow && entry.split_idx < 0) {
                shadow = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ckpt.per_step_conv_state_dim);
            } else {
                shadow = ggml_dup_tensor(ctx, entry.primary);
            }
            if (entry.split_idx >= 0) {
                ggml_format_name(shadow, "shadow_s_l%d_d%d", entry.il, entry.split_idx);
            } else {
                ggml_format_name(shadow, "shadow_s_l%d", entry.il);
            }
            entry.primary = shadow;
        }

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for shadow tensors\n", __func__);
            ggml_free(ctx);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        LLAMA_LOG_INFO("%s: %10s shadow buffer = %8.2f MiB%s\n", __func__,
                   ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0,
                   conv_only_shadow ? " (conv-state only)" : "");
        ckpt.shadow_ctxs.push_back(ctx);
        ckpt.shadow_bufs.push_back(buf);

        for (const auto & entry : entries) {
            if (entry.split_idx < 0) {
                ckpt.s_l_shadow[entry.il] = entry.primary;
            }
        }
    }

    // Build split shadow lookup
    ckpt.split_s_l_shadow.resize(n_layer);
    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr || s_l[il]->extra == nullptr) {
            continue;
        }

        auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra;
        auto & shadow_split = ckpt.split_s_l_shadow[il];
        shadow_split.resize(split_info->n_device, nullptr);

        for (int d = 0; d < split_info->n_device; ++d) {
            if (split_info->splits[d] == nullptr) continue;
            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(split_info->splits[d]->buffer);
            for (auto & entry : buft_entries[buft]) {
                if (entry.il == il && entry.split_idx == d) {
                    shadow_split[d] = entry.primary;
                    break;
                }
            }
        }
    }

    ckpt.shadow_conv_only = conv_only_shadow;
    ckpt.allocated = true;
    return true;
}

bool llama_kv_cache::checkpoint_save(ggml_backend_sched_t sched) {
    if (!checkpoint_alloc_shadows(false)) {
        return false;
    }

    GGML_ASSERT(!ckpt.shadow_conv_only);

    const uint32_t n_layer = (uint32_t)s_l.size();

    ckpt.cells_snapshot = cells;
    ckpt.head_snapshot  = head;
    ckpt.used_snapshot  = used;

    std::unordered_set<ggml_backend_t> backends_to_sync;

    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr) {
            continue;
        }

        if (s_l[il]->extra != nullptr) {
            auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra;
            auto & shadow_split = ckpt.split_s_l_shadow[il];
            for (int d = 0; d < split_info->n_device; ++d) {
                if (split_info->splits[d] && shadow_split[d]) {
                    //ggml_backend_tensor_copy(split_info->splits[d], shadow_split[d]);
                    auto src_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]);
                    ggml_backend_tensor_copy_async(src_backend, src_backend, split_info->splits[d], shadow_split[d]);
                    backends_to_sync.insert(src_backend);
                }
            }
        } else {
            GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr);
            auto src_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]);
            GGML_ASSERT(src_backend != nullptr);
            ggml_backend_tensor_copy_async(src_backend, src_backend, s_l[il], ckpt.s_l_shadow[il]);
            backends_to_sync.insert(src_backend);
        }
    }

    for (auto backend : backends_to_sync) {
        ggml_backend_synchronize(backend);
    }

    ckpt.saved = true;
    return true;
}

bool llama_kv_cache::checkpoint_restore(ggml_backend_sched_t sched) {
    if (!ckpt.saved) {
        LLAMA_LOG_ERROR("%s: no checkpoint saved\n", __func__);
        return false;
    }

    GGML_ASSERT(!ckpt.shadow_conv_only);

    const uint32_t n_layer = (uint32_t)s_l.size();

    cells = ckpt.cells_snapshot;
    head  = ckpt.head_snapshot;
    used  = ckpt.used_snapshot;

    std::unordered_set<ggml_backend_t> backends_to_sync;

    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr) {
            continue;
        }

        if (s_l[il]->extra != nullptr) {
            auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra;
            auto & shadow_split = ckpt.split_s_l_shadow[il];
            for (int d = 0; d < split_info->n_device; ++d) {
                if (split_info->splits[d] && shadow_split[d]) {
                    auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]);
                    ggml_backend_tensor_copy_async(dst_backend, dst_backend, shadow_split[d], split_info->splits[d]);
                    backends_to_sync.insert(dst_backend);
                }
            }
        } else {
            GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr);
            GGML_ASSERT(ggml_nbytes(ckpt.s_l_shadow[il]) == ggml_nbytes(s_l[il]));
            auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]);
            GGML_ASSERT(dst_backend != nullptr);
            ggml_backend_tensor_copy_async(dst_backend, dst_backend, ckpt.s_l_shadow[il], s_l[il]);
            backends_to_sync.insert(dst_backend);
        }
    }

    for (auto backend : backends_to_sync) {
        ggml_backend_synchronize(backend);
    }

    return true;
}

void llama_kv_cache::checkpoint_delete() {
    ckpt.saved = false;
}

bool llama_kv_cache::per_step_alloc(const llama_model & model, int max_tokens) {
    if (ckpt.per_step_max_allocated >= max_tokens) {
        return true;
    }

    if (!ckpt.per_step_ssm.empty()) {
        for (struct ggml_context * ctx : ckpt.per_step_ctxs) {
            ggml_free(ctx);
        }
        for (ggml_backend_buffer_t buf : ckpt.per_step_bufs) {
            ggml_backend_buffer_free(buf);
        }
        ckpt.per_step_ctxs.clear();
        ckpt.per_step_bufs.clear();
        ckpt.per_step_ssm.clear();
        ckpt.per_step_conv.clear();
        ckpt.per_step_max_allocated = 0;
    }

    const uint32_t n_layer = (uint32_t)s_l.size();
    ckpt.per_step_ssm.resize(n_layer);
    ckpt.per_step_conv.resize(n_layer);

    const int64_t ssm_state_dim  = ckpt.per_step_ssm_state_size;
    const int64_t conv_dim       = ckpt.per_step_conv_dim;
    const int64_t conv_state_dim = ckpt.per_step_conv_state_dim;
    if (ssm_state_dim <= 0 || conv_dim <= 0) {
        LLAMA_LOG_ERROR("%s: per_step dimensions not set (ssm=%lld, conv_dim=%lld)\n",
                __func__, (long long)ssm_state_dim, (long long)conv_dim);
        return false;
    }

    int num_v_heads = model.hparams.ssm_dt_rank;
    int head_v_dim  = model.hparams.ssm_d_inner / num_v_heads;
    int d_conv      = model.hparams.ssm_d_conv;

    std::map<ggml_backend_buffer_type_t, std::vector<std::pair<std::pair<uint32_t, int32_t>, ggml_backend_buffer_type_t>>> buft_layers;

    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr) continue;

        if (s_l[il]->extra) {
            auto split_sl = (ggml_split_tensor_t *)s_l[il]->extra;
            for (int id = 0; id < split_sl->n_device; ++id) {
                if (!split_sl->splits[id]) continue;
                auto buft = ggml_backend_buffer_get_type(split_sl->splits[id]->buffer);
                GGML_ASSERT(buft != nullptr);
                buft_layers[buft].push_back({{il, id}, buft});
            }
        } else {
            ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(s_l[il]->buffer);
            buft_layers[buft].push_back({{il, -1}, buft});
        }
    }

    for (auto & [buft, layers] : buft_layers) {
        // 2 tensors per layer: SSM states + qkv features
        ggml_init_params params = {
            /*.mem_size   =*/ layers.size() * 2 * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(params);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to create ggml context for per-step checkpoints\n", __func__);
            return false;
        }

        for (auto & [p, bt] : layers) {
            auto [il, id] = p;
            // SSM state: max_tokens * ssm_state_dim
            if (id < 0) {
                if (max_tokens > 1) {
                    GGML_ASSERT(ckpt.per_step_ssm[il].empty());
                    GGML_ASSERT(ckpt.per_step_conv[il].empty());
                    ggml_tensor * t_ssm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)(max_tokens - 1) * ssm_state_dim);
                    ggml_format_name(t_ssm, "per_step_ssm_l%d", il);
                    ckpt.per_step_ssm[il].push_back(t_ssm);
                }

                // Conv features (qkv_mixed): max_tokens * conv_dim
                ggml_tensor * t_qkv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)max_tokens * conv_state_dim);
                ggml_format_name(t_qkv, "per_step_qkv_l%d", il);
                ckpt.per_step_conv[il].push_back(t_qkv);
            } else {
                auto split_sl = (ggml_split_tensor_t *)s_l[il]->extra;
                auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[il].ssm_out->extra;
                GGML_ASSERT(split_ssm_out && split_ssm_out->splits[id]);
                auto split = split_ssm_out->splits[id];
                GGML_ASSERT(split->ne[0] % head_v_dim == 0);
                if (ckpt.per_step_ssm[il].empty()) {
                    ckpt.per_step_ssm [il].resize(split_sl->n_device, nullptr);
                    ckpt.per_step_conv[il].resize(split_sl->n_device, nullptr);
                } else {
                    GGML_ASSERT(int(ckpt.per_step_ssm[il].size()) == split_sl->n_device);
                    GGML_ASSERT(int(ckpt.per_step_conv[il].size()) == split_sl->n_device);
                    GGML_ASSERT(ckpt.per_step_ssm[il][id] == nullptr);
                    GGML_ASSERT(ckpt.per_step_conv[il][id] == nullptr);
                }
                int nv = split->ne[0] / head_v_dim; // number of heads handled by this device
                auto [this_conv_dim, this_ssm_dim] = model.hparams.n_embd_v_s_dims(nv);

                if (max_tokens > 1) {
                    auto t_ssm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)(max_tokens - 1) * this_ssm_dim);
                    ggml_format_name(t_ssm, "per_step_ssm_l%d_%d", il, id);
                    ckpt.per_step_ssm[il][id] = t_ssm;
                }

                auto t_qkv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)max_tokens * this_conv_dim * (d_conv - 1));
                ggml_format_name(t_qkv, "per_step_qkv_l%d_%d", il, id);
                ckpt.per_step_conv[il][id] = t_qkv;
            }
        }

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for per-step checkpoints\n", __func__);
            ggml_free(ctx);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        LLAMA_LOG_INFO("%s: %10s per-step buffer = %8.2f MiB (max_tokens=%d)\n", __func__,
                       ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0, max_tokens);
        ckpt.per_step_ctxs.push_back(ctx);
        ckpt.per_step_bufs.push_back(buf);
    }

    ckpt.per_step_max_allocated = max_tokens;
    return true;
}

static void restore_recurrent_cache_tensors(int step, ggml_backend_sched_t sched,
        size_t ssm_bytes, size_t conv_bytes,
        ggml_tensor * s_l, ggml_tensor * per_step_ssm, ggml_tensor * per_step_conv,
        std::unordered_set<ggml_backend_t> & backends_to_sync) {
    auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, s_l);
    auto dst = *s_l;
    dst.ne[0] = ssm_bytes/sizeof(float);
    dst.nb[1] = dst.nb[2] = dst.nb[3] = ssm_bytes + conv_bytes;
    dst.data  = (char *)s_l->data + conv_bytes;
    auto src = dst;
    src.data = (char *)per_step_ssm->data + (size_t)step * ssm_bytes;
    ggml_backend_tensor_copy_async(dst_backend, dst_backend, &src, &dst);
    backends_to_sync.insert(dst_backend);

    dst.data  = (char *)s_l->data;
    dst.ne[0] = conv_bytes/sizeof(float);
    src = dst;
    src.data = (char *)per_step_conv->data + (size_t)step * conv_bytes;
    ggml_backend_tensor_copy_async(dst_backend, dst_backend, &src, &dst);
}

bool llama_kv_cache::per_step_restore(const llama_model & model, ggml_backend_sched_t sched, int step) {
    if (ckpt.per_step_ssm.empty() || step < 0) {
        return false;
    }

    std::unordered_set<ggml_backend_t> backends_to_sync;

    const int64_t ssm_state_dim  = ckpt.per_step_ssm_state_size;
    const int64_t conv_state_dim = ckpt.per_step_conv_state_dim;
    const int64_t conv_dim       = ckpt.per_step_conv_dim;
    const int32_t d_conv         = ckpt.per_step_d_conv;
    if (ssm_state_dim <= 0 || conv_dim <= 0 || d_conv <= 1) return false;

    const int64_t ssm_bytes  = ssm_state_dim * sizeof(float);
    const int64_t conv_bytes = conv_state_dim * sizeof(float);

    int num_v_heads = model.hparams.ssm_dt_rank;
    int head_v_dim  = model.hparams.ssm_d_inner / num_v_heads;

    const uint32_t n_layer = (uint32_t)s_l.size();
    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr || ckpt.per_step_ssm[il].empty()) continue;

        if (!s_l[il]->extra) {
            restore_recurrent_cache_tensors(step, sched, ssm_bytes, conv_bytes,
                s_l[il], ckpt.per_step_ssm[il].front(), ckpt.per_step_conv[il].front(),
                backends_to_sync);
        } else {
            auto split_sl = (const ggml_split_tensor_t *)s_l[il]->extra;
            for (int id = 0; id < split_sl->n_device; ++id) {
                if (!split_sl->splits[id]) continue;
                auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[il].ssm_out->extra;
                GGML_ASSERT(split_ssm_out && split_ssm_out->splits[id]);
                auto split = split_ssm_out->splits[id];
                GGML_ASSERT(split->ne[0] % head_v_dim == 0);
                int nv = split->ne[0] / head_v_dim; // number of heads handled by this device
                auto [this_conv_dim, this_ssm_dim] = model.hparams.n_embd_v_s_dims(nv);
                auto this_conv_bytes = (d_conv - 1) * this_conv_dim * sizeof(float);
                restore_recurrent_cache_tensors(step, sched, this_ssm_dim * sizeof(float), this_conv_bytes,
                    split_sl->splits[id], ckpt.per_step_ssm[il][id], ckpt.per_step_conv[il][id],
                    backends_to_sync);
            }
        }
    }

    // TODO: do we need to synchronize here?
    for (auto backend : backends_to_sync) {
        ggml_backend_synchronize(backend);
    }

    return true;
}

static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
    for (int32_t i = 0; i < (int32_t) cache.size; ++i) {
        cache.cells[i].pos = -1;
        cache.cells[i].src = i;
        cache.cells[i].seq_id.clear();
    }
    cache.head = 0;
    cache.used = 0;

    for (auto & buf : cache.bufs) {
        ggml_backend_buffer_clear(buf, 0);
    }
}

static bool llama_kv_cache_seq_rm(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1) {
    uint32_t new_head = cache.size;

    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

    // models like Mamba can't have a state partially erased
    if (cache.recurrent) {
        if (seq_id >= (int64_t) cache.size) {
            // could be fatal
            return false;
        }
        if (0 <= seq_id) {
            // partial intersection is invalid
            if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
                return false;
            }
        } else {
            // seq_id is negative, then the range should include everything or nothing
            if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
                return false;
            }
        }
    }

    const bool has_qnext_state = llama_kv_has_qnext_state_storage(cache);

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
            if (seq_id < 0) {
                cache.cells[i].seq_id.clear();
            } else if (cache.cells[i].has_seq_id(seq_id)) {
                cache.cells[i].seq_id.erase(seq_id);
            } else {
                continue;
            }
            if (cache.cells[i].is_empty()) {
                // keep count of the number of used cells
                if (cache.cells[i].pos >= 0) cache.used--;

                cache.cells[i].pos = -1;
                if (has_qnext_state) {
                    cache.cells[i].src = i;
                }
                if (new_head == cache.size) new_head = i;
            }
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;

    return true;
}

static void llama_kv_cache_seq_cp(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id_src,
                 llama_seq_id   seq_id_dst,
                    llama_pos   p0,
                    llama_pos   p1) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

    if (cache.recurrent) {
        if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) {
            seq_id_src = cache.cells[seq_id_src].src;
            GGML_ASSERT((uint32_t) seq_id_src < cache.size);
            // intent to "copy from"
            // supports copy chains thanks to taking the source of the source
            cache.cells[seq_id_dst].src = seq_id_src;

            // preserve the "keep or clear" status of the copied sequence
            if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) {
                cache.cells[seq_id_dst].seq_id.insert(seq_id_dst);
            } else {
                cache.cells[seq_id_dst].seq_id.erase(seq_id_dst);
            }

            cache.do_copy = true;

            cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos;
        }
        return;
    }

    const bool has_qnext_state = llama_kv_has_qnext_state_storage(cache);
    if (has_qnext_state &&
            llama_kv_qnext_seq_id_in_range(cache, seq_id_dst) &&
            llama_kv_qnext_seq_id_in_range(cache, seq_id_src) &&
            (uint32_t) seq_id_dst < cache.size &&
            (uint32_t) seq_id_src < cache.size) {
        seq_id_src = cache.cells[seq_id_src].src;
        GGML_ASSERT((uint32_t) seq_id_src < cache.size);

        cache.cells[seq_id_dst].src = seq_id_src;
        cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos;
        cache.do_copy = true;
    }

    // otherwise, this is the KV cache of a Transformer-like model

    cache.head = 0;

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
            cache.cells[i].seq_id.insert(seq_id_dst);
        }
    }
}

static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    uint32_t new_head = cache.size;
    const bool has_qnext_state = llama_kv_has_qnext_state_storage(cache);

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (!cache.cells[i].has_seq_id(seq_id)) {
            if (cache.cells[i].pos >= 0) cache.used--;
            cache.cells[i].pos = -1;
            if (has_qnext_state) {
                cache.cells[i].src = i;
            }
            cache.cells[i].seq_id.clear();
            if (new_head == cache.size) new_head = i;
        } else {
            cache.cells[i].seq_id.clear();
            cache.cells[i].seq_id.insert(seq_id);
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
}

static void llama_kv_cache_seq_add(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
                    llama_pos   delta) {
    uint32_t new_head = cache.size;

    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
    // If there is no range then return early to avoid looping over the cache.
    if (p0 == p1) return;

    if (cache.recurrent) {
        // for Mamba-like models, only the pos needs to be shifted
        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
            llama_kv_cell & cell = cache.cells[seq_id];
            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
                cell.pos += delta;
            }
        }
        return;
    }

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
            cache.has_shift = true;
            cache.cells[i].pos   += delta;
            cache.cells[i].delta += delta;

            if (cache.cells[i].pos < 0) {
                if (!cache.cells[i].is_empty()) {
                    cache.used--;
                }
                cache.cells[i].pos = -1;
                cache.cells[i].seq_id.clear();
                if (new_head == cache.size) {
                    new_head = i;
                }
            }
        }
    }

    // If we freed up a slot, set head to it so searching can start there.
    // Otherwise we just start the next search from the beginning.
    cache.head = new_head != cache.size ? new_head : 0;
}

static void llama_kv_cache_seq_div(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
                          int   d) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
    // If there is no range then return early to avoid looping over the cache.
    if (p0 == p1) return;

    if (cache.recurrent) {
        // for Mamba-like models, only the pos needs to be changed
        if (0 <= seq_id && seq_id < (int64_t) cache.size) {
            llama_kv_cell & cell = cache.cells[seq_id];
            if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
                cell.pos /= d;
            }
        }
        return;
    }

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
            cache.has_shift = true;

            {
                llama_pos p_old = cache.cells[i].pos;
                cache.cells[i].pos   /= d;
                cache.cells[i].delta += cache.cells[i].pos - p_old;
            }
        }
    }
}

static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    llama_pos result = 0;

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].has_seq_id(seq_id)) {
            result = std::max(result, cache.cells[i].pos);
        }
    }

    return result;
}

static llama_pos llama_kv_cache_seq_pos_min(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    llama_pos result = -1;

    for (uint32_t i = 0; i < cache.size; ++i) {
        if (cache.cells[i].has_seq_id(seq_id)) {
            result = cache.cells[i].pos;
            break;
        }
    }

    return result;
}

static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
    cache.do_defrag = true;
}

static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
    // the FA kernels require padding to avoid extra runtime boundary checks
    return cparams.flash_attn ? 256u : 32u;
}

//
// model loading and saving
//

//
// load LLaMA models
//

void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();
    if (model.arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    const char * rope_scaling_type = hparams.rope_scaling_type_name(hparams.rope_scaling_type_train);

    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
        bool is_var = false;

        std::vector<uint32_t> v;
        for (uint32_t i = 0; i < n; ++i) {
            v.push_back(f(i));
            if (v[i] != v[0]) {
                is_var = true;
            }
        }

        std::stringstream ss;

        if (is_var) {
            ss << "[";
            for (uint32_t i = 0; i < n; ++i) {
                ss << v[i];
                if (i < n - 1) {
                    ss << ", ";
                }
            }
            ss << "]";
        } else {
            ss << v[0];
        }

        return ss.str();
    };

    // hparams
    LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, llama_model_arch_name(model.arch));

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train      = %u\n",     __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd           = %u\n",     __func__, hparams.n_embd);
        LLAMA_LOG_INFO("%s: n_layer          = %u\n",     __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head           = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv        = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_rot            = %u\n",     __func__, hparams.n_rot);
        LLAMA_LOG_INFO("%s: n_swa            = %u\n",     __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: n_swa_pattern    = %u\n",     __func__, hparams.n_swa_pattern);
        LLAMA_LOG_INFO("%s: n_embd_head_k    = %u\n",     __func__, hparams.n_embd_head_k(0));
        LLAMA_LOG_INFO("%s: n_embd_head_v    = %u\n",     __func__, hparams.n_embd_head_v(0));
        LLAMA_LOG_INFO("%s: n_gqa            = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_k_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_v_gqa     = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: f_norm_eps       = %.1e\n",   __func__, hparams.f_norm_eps);
        LLAMA_LOG_INFO("%s: f_norm_rms_eps   = %.1e\n",   __func__, hparams.f_norm_rms_eps);
        LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale    = %.1e\n",   __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: n_ff             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert         = %u\n",     __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used    = %u\n",     __func__, hparams.n_expert_used);
        LLAMA_LOG_INFO("%s: causal attn      = %d\n",     __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type     = %d\n",     __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type        = %d\n",     __func__, hparams.rope_type);
        LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type);
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
            LLAMA_LOG_INFO("%s: mrope sections   = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
        }
        LLAMA_LOG_INFO("%s: ssm_d_conv       = %u\n",     __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner      = %u\n",     __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state      = %u\n",     __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank      = %u\n",     __func__, hparams.ssm_dt_rank);
        LLAMA_LOG_INFO("%s: ssm_n_group      = %u\n",     __func__, hparams.ssm_n_group);
    }

    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
    if (ml.n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params     = %.3f T\n", __func__, ml.n_elements*1e-12);
    } else if (ml.n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params     = %.3f B\n", __func__, ml.n_elements*1e-9);
    } else if (ml.n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params     = %.3f M\n", __func__, ml.n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params     = %.3f K\n", __func__, ml.n_elements*1e-3);
    }
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size       = %.3f MiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0,        ml.n_bytes*8.0/ml.n_elements);
    } else {
        LLAMA_LOG_INFO("%s: model size       = %.3f GiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    }
    {
        auto n_bytes = ml.n_bytes;
        auto n_elements = ml.n_elements;
        auto meta_tke = ml.get_tensor_meta("token_embd.weight");
        auto meta_out = ml.get_tensor_meta("output.weight");
        if (meta_tke && meta_out) {
            n_bytes -= ggml_nbytes(meta_tke);
            n_elements -= ggml_nelements(meta_tke);
            n_bytes -= ggml_nbytes(meta_out);
            n_elements -= ggml_nelements(meta_out);
            if (n_bytes < GiB) {
                LLAMA_LOG_INFO("%s: repeating layers = %.3f MiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0,        n_bytes*8.0/n_elements);
            } else {
                LLAMA_LOG_INFO("%s: repeating layers = %.3f GiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
            }
            if (ml.n_elements >= 1e9) {
                LLAMA_LOG_INFO(", %.3f B parameters)\n", n_elements*1e-9);
            } else {
                LLAMA_LOG_INFO(", %.3f M parameters)\n", n_elements*1e-6);
            }
        }
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());

    if (model.is_mla_model()) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q             = %d\n",     __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv            = %d\n",     __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((enum llm_expert_gating_func_type) hparams.expert_gating_func));
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
    }

    if (model.arch == LLM_ARCH_QWEN2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp       = %d\n",     __func__, hparams.n_ff_shexp);
    }

    if (model.arch == LLM_ARCH_QWEN3MOE || model.arch == LLM_ARCH_OPENAI_MOE || model.arch == LLM_ARCH_QWEN3VLMOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp         = %d\n",     __func__, hparams.n_ff_exp);
    }

    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale  = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
    }

    if (model.arch == LLM_ARCH_BAILINGMOE2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead   = %d\n",     __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n",     __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp           = %d\n",     __func__, hparams.n_ff_shexp);
        LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n",     __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: n_expert_groups      = %d\n",     __func__, hparams.n_expert_groups);
        LLAMA_LOG_INFO("%s: n_group_used         = %d\n",     __func__, hparams.n_group_used);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llm_expert_gating_func_type) hparams.expert_gating_func));
        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n",     __func__, hparams.nextn_predict_layers);
    }

    vocab.print_info();

}

static void llm_requantize_output_tensor(llama_model & model, ggml_type new_type) {
    if (new_type == GGML_TYPE_COUNT || !model.output) return;
    if (model.output_mtp && model.output_mtp != model.output) {
        LLAMA_LOG_WARN("%s: MTP output tensor is already present => not requantizing\n", __func__);
        return;
    }
    if (model.output->type == new_type) {
        LLAMA_LOG_WARN("%s: output tensor is already of type %s => not requantizing\n", __func__, ggml_type_name(new_type));
    }
    auto [other_type, n_interleaved] = interleaved_properties(new_type);
    if (model.output->ne[1] % n_interleaved != 0) {
        LLAMA_LOG_WARN("%s: number of rows %ld is not a multiple of %d row interleaving for %s\n", __func__,
                model.output->ne[1], n_interleaved, ggml_type_name(new_type));
        LLAMA_LOG_WARN("%s: using %s instead of %s\n", __func__, ggml_type_name(other_type), ggml_type_name(new_type));
        new_type = other_type;
        n_interleaved = 1;
    }
    auto nbytes_orig = ggml_nbytes(model.output);
    auto row_size    = ggml_row_size(new_type, model.output->ne[0]);
    auto nbytes_new  = row_size*ggml_nrows(model.output);
    if (nbytes_new >= nbytes_orig) {
        LLAMA_LOG_WARN("%s: if requantized to %s the output tensor size would be %zu, which is >= the current size %zu => not requantizing\n", __func__, ggml_type_name(new_type), nbytes_new, nbytes_orig);
        return;
    }

    LLAMA_LOG_INFO("====== Creating extra output tensor of type %s for MTP usage. Additional memory required is %.2f MiB\n",
            ggml_type_name(new_type), nbytes_new/1024./1024.);

    bool is_host = ggml_backend_buffer_is_host(model.output->buffer);

    auto tensor_data = model.output->data;
    std::vector<char> tensor_data_buf;
    if (!is_host) {
        tensor_data_buf.resize(nbytes_orig);
        ggml_backend_tensor_get(model.output, tensor_data_buf.data(), 0, nbytes_orig);
        tensor_data = tensor_data_buf.data();
    }

    auto tt_new  = ggml_internal_get_type_traits(new_type);
    auto new_output = std::make_unique<ggml_tensor>(*model.output);
    new_output->type = new_type;
    new_output->nb[0] = tt_new.type_size;
    new_output->nb[1] = row_size;
    new_output->nb[2] = new_output->nb[1] * new_output->ne[1];
    new_output->nb[3] = new_output->nb[2] * new_output->ne[2];
    GGML_ASSERT(ggml_nbytes(new_output.get()) == nbytes_new);
    new_output->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(model.output->buffer), nbytes_new);
    new_output->data   = ggml_backend_buffer_get_base(new_output->buffer);
    new_output->op     = GGML_OP_NONE;
    for (int j = 0; j < GGML_MAX_SRC; ++j) new_output->src[j] = nullptr;
    ggml_set_name(new_output.get(), "output_extra.weight");
    ggml_backend_buffer_set_usage(new_output->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    std::vector<char> new_data_buf;
    char * new_data = (char *)new_output->data;
    if (!is_host) {
        new_data_buf.resize(nbytes_new);
        new_data = new_data_buf.data();
    }

    int nthread = std::max<int>(1, std::thread::hardware_concurrency()/2);

    auto compute = [t = model.output, tensor_data, new_data, nthread, new_type, n_interleaved] (int ith) {
        std::vector<float> work(t->ne[0]*n_interleaved);
        auto tt_orig = ggml_internal_get_type_traits(t->type);
        auto tt_new  = ggml_internal_get_type_traits(new_type);
        iqk_quantize_any(int(t->type), int(new_type),
                t->ne[0], t->ne[1], t->ne[2], t->ne[3],
                t->nb[0], t->nb[1], t->nb[2], t->nb[3],
                tensor_data, new_data, work.data(), tt_orig.to_float, tt_new.from_float, ith, nthread);
    };
    std::vector<std::thread> workers(nthread-1);
    for (int it = 0; it < nthread-1; ++it) workers[it] = std::thread(compute, it);
    compute(nthread-1);
    for (auto & w : workers) w.join();

    if (!is_host) {
        ggml_backend_tensor_set(new_output.get(), new_data, 0, nbytes_new);
    }

    model.output_mtp_ptr = std::move(new_output);
    model.output_mtp     = model.output_mtp_ptr.get();

}

static void llm_prepare_mla(llama_model & model, int mla) {
    if (!model.is_mla_model()) return;
    const auto& hparams = model.hparams;
    const int n_layer = model.layers.size();
    int n_to_compute = 0;
    for (auto& l : model.layers) {
        if (!l.wk_b) ++n_to_compute;
    }
    if (mla > 0 && n_to_compute > 0) {
        // Prepare wk_b tensors to enable MLA usage also for model files that do not include
        // the wk_b tensors (because, e.g., they were converted using mainline llama.cpp)
        // We do it here because otherwise wkv_b may get run-time-repacked, which will make
        // preparation of wk_b impossible. It also has the benefit that wk_b will get automatically
        // run-time repacked if the rtr option is set. The downside is that we will prepare wk_b
        // even if it is not needed (because MLA is not being used). If we wanted to avoid
        // computing wk_b from wkv_b if not needed, we would need to propagate the context parameters
        // to the model loading function. On the other hand, in some hypothetical bright future,
        // where we are able to use the optimum settings for the computation, which for DeepSeekV3/R1/Lite
        // is no MLA + FA for prompt processing, and MLA + FA for token generation, it would be useful
        // to change the MLA setting on the fly, depending on context. In that case, having prepared
        // the MLA tensors here is the right ting to do^TM.
        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k(0) - hparams.n_rot;
        const uint32_t kv_lora_rank = hparams.n_lora_kv;
        const int32_t n_embd_head_v = hparams.n_embd_head_v(0);
        const int32_t n_head        = hparams.n_head(0);
        std::vector<uint8_t> work_data;
        LLAMA_LOG_INFO("============ %s: need to compute %d wk_b/wv_b tensors\n", __func__, n_to_compute);
        for (int il = 1; il < n_layer; ++il) {
            // Somehow the number of heads is being defined as being per layer. Not sure why this is the
            // case, but for now we do not support strange models that have different numbers of heads
            // in different model layers.
            if ((int)hparams.n_head(il) != n_head) throw std::runtime_error("Unsupported configuration");
        }
        size_t max_wkv_size = 0;
        size_t max_wk_size = 0;
        for (auto& l : model.layers) {
            if (!l.wk_b) {
                auto new_type = ggml_is_quantized(l.wkv_b->type) ? GGML_TYPE_Q8_0 : l.wkv_b->type;
                auto size = ggml_row_size(new_type, n_embd_head_qk_nope)*kv_lora_rank*n_head;
                max_wk_size = std::max(max_wk_size, size);
                if (!ggml_backend_buffer_is_host(l.wkv_b->buffer)) {
                    max_wkv_size = std::max(max_wkv_size, ggml_nbytes(l.wkv_b));
                }
            }
        }
        // tensor_data layout: [wk_b_f32 | wk_b_f32_t | wk_b | wk_b_pp]. wk_b_pp slot is only
        // populated under -sm graph/attn (pp_opt-favoring layout); allocated unconditionally
        // for simplicity (one max_wk_size buffer).
        auto context_size = 2*max_wk_size + 2*n_embd_head_qk_nope*kv_lora_rank*n_head*sizeof(float);
        context_size *= 2; // just in case;
        std::vector<uint8_t> wkv_buffer;
        if (max_wkv_size > 0) wkv_buffer.resize(max_wkv_size);
        // So, transposing tensors and then making them contiguous as needed for wk_b may or may not
        // be supported on all backends. Hence, to be sure that the preparation of wk_b will
        // work correctly, we do it on the CPU backend. We then copy the resulting tensor data to
        // the bacikend where wkv_b is stored.
        ggml_init_params params{context_size, nullptr, true};
        auto ctx = ggml_init(params);
        auto graph = ggml_new_graph_custom(ctx, 8, false);
        std::vector<uint8_t> tensor_data(2*n_embd_head_qk_nope*kv_lora_rank*n_head*sizeof(float) + 2*max_wk_size);
        for (int il = 0; il < n_layer; ++il) {
            auto& l = model.layers[il];
            if (l.wk_b) continue;
            auto wkv_b = *l.wkv_b;
            if (!ggml_backend_buffer_is_host(l.wkv_b->buffer)) {
                ggml_backend_tensor_get(l.wkv_b, wkv_buffer.data(), 0, ggml_nbytes(l.wkv_b));
                wkv_b.data = wkv_buffer.data();
            }
            auto wk_b_view = ggml_view_3d(ctx, &wkv_b, kv_lora_rank, n_embd_head_qk_nope, n_head,
                    l.wkv_b->nb[1], l.wkv_b->nb[1]*(n_embd_head_qk_nope + n_embd_head_v), 0);
            auto wk_b_f32 = ggml_cast(ctx, wk_b_view, GGML_TYPE_F32);
            wk_b_f32->data = tensor_data.data();
            auto wk_b_f32_tview = ggml_transpose(ctx, wk_b_f32);
            auto wk_b_f32_t = ggml_cont(ctx, wk_b_f32_tview);
            wk_b_f32_t->data = (char *)wk_b_f32->data + ggml_nbytes(wk_b_f32);

            auto new_type = ggml_is_quantized(wkv_b.type) ?
                wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8 ? GGML_TYPE_Q8_0_R8 : GGML_TYPE_Q8_0 : wkv_b.type;
            auto wk_b = ggml_cast(ctx, wk_b_f32_t, new_type);
            wk_b->data = (char *)wk_b_f32_t->data + ggml_nbytes(wk_b_f32_t);

            ggml_build_forward_expand(graph, wk_b);

            auto plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
            if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
            plan.work_data = work_data.data();

            auto status = ggml_graph_compute(graph, &plan);
            if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute wk_b");

            auto name = std::string{"blk."} + std::to_string(il) + ".attn_k_b.weight";

            // Per-head split wk_b/wv_b under -sm graph/attn so each rank's batched matmul
            // reads only its share of heads (split_dim=2), mirroring prepare_split_tensors.
            const bool tp_replicate =
                (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)
                && l.wo && l.wo->extra;

            auto materialize = [&](ggml_tensor * source,
                                   std::unique_ptr<ggml_tensor> & computed,
                                   std::vector<std::unique_ptr<ggml_tensor>> & replicas,
                                   llama_split_tensor & split,
                                   const std::string & tname) -> ggml_tensor * {
                if (tp_replicate) {
                    auto wo_split = (const ggml_split_tensor_t *)l.wo->extra;
                    const int n_device = wo_split->n_device;
                    const int64_t n_embd_head_v_full = hparams.n_embd_head_v_full;

                    std::vector<int> head_offsets(n_device + 1, 0);
                    for (int idx = 0; idx < n_device; ++idx) {
                        int n_h_id = 0;
                        if (wo_split->splits[idx]) {
                            n_h_id = (int)(wo_split->splits[idx]->ne[0] / n_embd_head_v_full);
                        }
                        head_offsets[idx + 1] = head_offsets[idx] + n_h_id;
                    }

                    computed = std::make_unique<ggml_tensor>(*source);
                    computed->buffer = nullptr;
                    computed->data   = nullptr;
                    computed->op = GGML_OP_NONE;
                    for (int j = 0; j < GGML_MAX_SRC; ++j) computed->src[j] = nullptr;
                    ggml_set_name(computed.get(), tname.c_str());

                    replicas.resize(n_device);
                    split.tensor_splits.assign(n_device, nullptr);

                    const size_t head_block_bytes = source->nb[2];

                    for (int id = 0; id < n_device; ++id) {
                        if (!wo_split->splits[id] || !wo_split->splits[id]->buffer) continue;
                        const int head_offset  = head_offsets[id];
                        const int n_head_local = head_offsets[id + 1] - head_offset;
                        if (n_head_local <= 0) continue;

                        const size_t slice_bytes = (size_t)n_head_local * head_block_bytes;
                        auto dev_buft = ggml_backend_buffer_get_type(wo_split->splits[id]->buffer);
                        auto dev_buf  = ggml_backend_buft_alloc_buffer(dev_buft, slice_bytes);
                        if (!dev_buf) {
                            throw std::runtime_error("Failed to allocate per-rank buffer for " + tname);
                        }
                        ggml_backend_buffer_set_usage(dev_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
                        model.bufs.push_back(dev_buf);
                        // Intentionally not updating mem_used[id] here: llm_prepare_mla
                        // runs post-load, after distribute_mla_tensors has completed its
                        // allocation rounds, so no downstream allocator consults mem_used
                        // anymore.

                        replicas[id] = std::make_unique<ggml_tensor>(*source);
                        auto rep = replicas[id].get();
                        rep->ne[2] = n_head_local;
                        rep->nb[3] = rep->nb[2] * (size_t)rep->ne[2];
                        rep->buffer = dev_buf;
                        rep->data   = ggml_backend_buffer_get_base(dev_buf);
                        rep->op = GGML_OP_NONE;
                        for (int j = 0; j < GGML_MAX_SRC; ++j) rep->src[j] = nullptr;
                        rep->view_src = nullptr;
                        rep->view_offs = 0;
                        rep->extra = nullptr;
                        ggml_set_name(rep, (tname + "." + std::to_string(id)).c_str());

                        const uint8_t * src_bytes = (const uint8_t *)source->data + (size_t)head_offset * head_block_bytes;
                        ggml_backend_tensor_set(rep, src_bytes, 0, slice_bytes);
                        if (ggml_backend_buffer_is_host(rep->buffer)) {
                            iqk_modify_tensor(rep);
                        }
                        split.tensor_splits[id] = rep;
                    }

                    split.ggml.n_device  = n_device;
                    split.ggml.split_dim = 2;
                    split.ggml.splits    = split.tensor_splits.data();
                    computed->extra = (void *)&split.ggml;

                    LLAMA_LOG_INFO("Computed %s as %d x %d x %d of type %s, split across %d devices on dim=2\n",
                            tname.c_str(), (int)source->ne[0], (int)source->ne[1], (int)source->ne[2],
                            ggml_type_name(source->type), n_device);
                } else {
                    computed = std::make_unique<ggml_tensor>(*source);
                    computed->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(l.wkv_b->buffer), ggml_nbytes(source));
                    computed->data   = ggml_backend_buffer_get_base(computed->buffer);
                    // GGML_OP_NONE so the backend doesn't try to find the (now-freed) parents of source.
                    computed->op = GGML_OP_NONE;
                    for (int j = 0; j < GGML_MAX_SRC; ++j) computed->src[j] = nullptr;
                    ggml_set_name(computed.get(), tname.c_str());
                    ggml_backend_buffer_set_usage(computed->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
                    ggml_backend_tensor_set(computed.get(), source->data, 0, ggml_nbytes(source));
                    if (ggml_backend_buffer_is_host(computed->buffer)) {
                        iqk_modify_tensor(computed.get());
                    }

                    LLAMA_LOG_INFO("Computed %s as %d x %d x %d of type %s and stored in buffer %s\n",
                            tname.c_str(), (int)source->ne[0], (int)source->ne[1], (int)source->ne[2],
                            ggml_type_name(source->type), ggml_backend_buffer_name(computed->buffer));
                }
                model.tensors_by_name.push_back(std::make_pair(tname, computed.get()));
                return computed.get();
            };

            // pp_opt-favoring wk_b_pp = quantize(wk_b_f32) directly (absorb-favoring
            // wk_b above is its transpose). Not gated on mla — wk_b_pp shares wk_b_f32
            // with the wk_b synthesis above and skipping it breaks absorb on some
            // quant combinations. Second pass below gates the mla=1 saving for
            // GGUFs that ship wk_b directly.
            if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
                ggml_graph_clear(graph);
                auto wk_b_pp = ggml_cast(ctx, wk_b_f32, new_type);
                wk_b_pp->data = (char *)wk_b->data + ggml_nbytes(wk_b);
                // tensor_data layout is [wk_b_f32 | wk_b_f32_t | wk_b | wk_b_pp]. wk_b and
                // wk_b_pp are the same quant type and same shape, so wk_b_pp's slot is
                // exactly one wk_b-sized block past wk_b. The tensor_data vector was sized
                // 2*F32 + 2*max_wk_size, which covers both wk_b and wk_b_pp.
                GGML_ASSERT((char *)wk_b_pp->data + ggml_nbytes(wk_b_pp) <=
                            (char *)tensor_data.data() + tensor_data.size());
                ggml_build_forward_expand(graph, wk_b_pp);
                auto plan_pp = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
                if (plan_pp.work_size > work_data.size()) work_data.resize(plan_pp.work_size);
                plan_pp.work_data = work_data.data();
                auto status_pp = ggml_graph_compute(graph, &plan_pp);
                if (status_pp != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute attn_kv_b");
                auto name_pp = std::string{"blk."} + std::to_string(il) + ".attn_kv_b.weight";
                l.wk_b_pp = materialize(wk_b_pp, l.computed_wk_b_pp, l.computed_wk_b_pp_replicas, l.split_wk_b_pp, name_pp);
                ggml_graph_clear(graph);
            }

            l.wk_b = materialize(wk_b, l.computed_wk_b, l.computed_wk_b_replicas, l.split_wk_b, name);

            ggml_graph_clear(graph);
            auto wv_b = ggml_cont(ctx, ggml_view_3d(ctx, &wkv_b, kv_lora_rank, n_embd_head_v, n_head,
                        l.wkv_b->nb[1], l.wkv_b->nb[1]*(n_embd_head_qk_nope + n_embd_head_v), l.wkv_b->nb[1]*n_embd_head_qk_nope));
            wv_b->data = tensor_data.data();
            ggml_build_forward_expand(graph, wv_b);
            plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
            if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
            plan.work_data = work_data.data();
            status = ggml_graph_compute(graph, &plan);
            if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute wv_b");

            name = std::string{"blk."} + std::to_string(il) + ".attn_v_b.weight";

            l.wv_b = materialize(wv_b, l.computed_wv_b, l.computed_wv_b_replicas, l.split_wv_b, name);

            ggml_graph_clear(graph);
        }
        ggml_free(ctx);
    }

    // Second pass: for layers where wk_b came from the GGUF directly, produce
    // wk_b_pp here. Only under -sm graph/attn AND mla > 1; mla=1 skips pp_opt.
    int n_computed = 0;
    if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && mla > 1) {
        int n_pp_to_compute = 0;
        for (auto & l : model.layers) {
            if (l.wk_b && !l.wk_b_pp) ++n_pp_to_compute;
        }
        if (n_pp_to_compute > 0) {
            const uint32_t n_embd_head_qk_nope_pp = hparams.n_embd_head_k(0) - hparams.n_rot;
            const uint32_t kv_lora_rank_pp = hparams.n_lora_kv;
            const int32_t  n_head_pp       = hparams.n_head(0);

            size_t max_wk_size_pp = 0;
            for (auto & l : model.layers) {
                if (l.wk_b && !l.wk_b_pp) {
                    max_wk_size_pp = std::max(max_wk_size_pp, ggml_nbytes(l.wk_b));
                }
            }
            auto context_size_pp = 4 * max_wk_size_pp +
                4 * (size_t)n_embd_head_qk_nope_pp * kv_lora_rank_pp * n_head_pp * sizeof(float);
            context_size_pp *= 2;

            std::vector<uint8_t> work_data_pp;
            // Hoist the full-wk_b assembly buffer outside the per-layer loop so we
            // don't re-allocate ~max_wk_size_pp bytes per layer.
            std::vector<uint8_t> full_wk_b_host_buf(max_wk_size_pp);
            // tensor_data_pp holds, in order: [wk_b_f32 | wk_b_pp_f32 | wk_b_pp_q].
            // F32 slots size by n_embd_head_qk_nope * kv_lora_rank * n_head * 4; the
            // requantized slot fits in max_wk_size_pp. After the N->1 graph compute
            // consolidation we no longer need a second wk_b-sized slot.
            std::vector<uint8_t> tensor_data_pp(
                2 * (size_t)n_embd_head_qk_nope_pp * kv_lora_rank_pp * n_head_pp * sizeof(float) +
                max_wk_size_pp);

            ggml_init_params params_pp{context_size_pp, nullptr, true};
            auto ctx_pp = ggml_init(params_pp);
            auto graph_pp = ggml_new_graph_custom(ctx_pp, 8, false);
            LLAMA_LOG_INFO("============ %s: need to compute %d attn_kv_b tensors\n", __func__, n_pp_to_compute);

            for (int il = 0; il < n_layer; ++il) {
                auto & l = model.layers[il];
                if (!l.wk_b || l.wk_b_pp) continue;

                // Under -sm graph/attn (the outer block's gate), distribute_mla_tensors
                // always populates l.wk_b->extra and l.wo->extra. If either is missing,
                // we're in a degenerate config and pp_opt falls back to the runtime
                // transpose in build_deepseek2.cpp; skip here.
                if (!l.wo || !l.wo->extra || !l.wk_b->extra) continue;

                ++n_computed;

                // Per-rank wk_b slices: each lives on a single device as a regular CUDA
                // tensor (not the split-buffer wrapper which lacks a get_tensor impl for
                // split_dim=2). Read each rank's slice independently.
                auto wk_b_split = (const ggml_split_tensor_t *)l.wk_b->extra;
                auto wo_split   = (const ggml_split_tensor_t *)l.wo->extra;
                const int n_device = wo_split->n_device;

                auto name = std::string{"blk."} + std::to_string(il) + ".attn_kv_b.weight";

                // Build a placeholder wk_b_pp_q on host with the full [kv_lora_rank, qk_nope, n_head]
                // shape (only used as a template for cloning per-rank metadata; no data filled).
                ggml_tensor wk_b_pp_template = *l.wk_b;
                wk_b_pp_template.ne[0] = (int64_t)kv_lora_rank_pp;
                wk_b_pp_template.ne[1] = (int64_t)n_embd_head_qk_nope_pp;
                wk_b_pp_template.ne[2] = (int64_t)n_head_pp;
                wk_b_pp_template.nb[0] = ggml_type_size(l.wk_b->type);
                wk_b_pp_template.nb[1] = wk_b_pp_template.nb[0] * (wk_b_pp_template.ne[0] / ggml_blck_size(l.wk_b->type));
                wk_b_pp_template.nb[2] = wk_b_pp_template.nb[1] * wk_b_pp_template.ne[1];
                wk_b_pp_template.nb[3] = wk_b_pp_template.nb[2] * wk_b_pp_template.ne[2];

                l.computed_wk_b_pp = std::make_unique<ggml_tensor>(wk_b_pp_template);
                l.computed_wk_b_pp->buffer = nullptr;
                l.computed_wk_b_pp->data   = nullptr;
                l.computed_wk_b_pp->op = GGML_OP_NONE;
                for (int j = 0; j < GGML_MAX_SRC; ++j) l.computed_wk_b_pp->src[j] = nullptr;
                ggml_set_name(l.computed_wk_b_pp.get(), name.c_str());

                l.computed_wk_b_pp_replicas.resize(n_device);
                l.split_wk_b_pp.tensor_splits.assign(n_device, nullptr);
                const size_t per_head_pp_bytes = wk_b_pp_template.nb[2];

                // Build head_offsets[] from per-rank ne[2]; matches the layout used by
                // prepare_split_tensors(split_dim=2) on wk_b so that head_offsets[id]
                // points to the start of rank id's head range in the full wk_b layout.
                std::vector<int> head_offsets(n_device + 1, 0);
                for (int id = 0; id < n_device; ++id) {
                    int n_h_id = 0;
                    if (wk_b_split->splits[id]) {
                        n_h_id = (int)wk_b_split->splits[id]->ne[2];
                    }
                    head_offsets[id + 1] = head_offsets[id] + n_h_id;
                }

                // Read all per-rank wk_b slices into the hoisted host buffer ordered
                // by head_offset. The data layout on disk is per-head contiguous, so
                // sequential rank reads at byte offset head_offsets[id] * per_head_in_bytes
                // reconstitute the original full wk_b on host.
                const size_t per_head_in_bytes = l.wk_b->nb[2];
                const size_t full_wk_b_nbytes  = (size_t)head_offsets[n_device] * per_head_in_bytes;
                GGML_ASSERT(full_wk_b_nbytes <= full_wk_b_host_buf.size());
                for (int id = 0; id < n_device; ++id) {
                    if (!wk_b_split->splits[id]) continue;
                    auto wk_b_rank = wk_b_split->splits[id];
                    const size_t rank_nbytes = ggml_nbytes(wk_b_rank);
                    const size_t byte_offset = (size_t)head_offsets[id] * per_head_in_bytes;
                    GGML_ASSERT(byte_offset + rank_nbytes <= full_wk_b_nbytes);
                    ggml_backend_tensor_get(wk_b_rank, full_wk_b_host_buf.data() + byte_offset, 0, rank_nbytes);
                }

                // ONE graph compute on the full assembled wk_b: dequant -> transpose ->
                // requant. Per-rank slicing happens on the resulting host buffer.
                ggml_tensor full_host = *l.wk_b;
                full_host.data = full_wk_b_host_buf.data();
                auto f_f32 = ggml_cast(ctx_pp, &full_host, GGML_TYPE_F32);
                f_f32->data = tensor_data_pp.data();
                auto f_view = ggml_transpose(ctx_pp, f_f32);
                auto f_cont = ggml_cont(ctx_pp, f_view);
                f_cont->data = (char *)f_f32->data + ggml_nbytes(f_f32);
                auto f_q    = ggml_cast(ctx_pp, f_cont, l.wk_b->type);
                f_q->data   = (char *)f_cont->data + ggml_nbytes(f_cont);
                ggml_build_forward_expand(graph_pp, f_q);
                auto plan = ggml_graph_plan(graph_pp, std::thread::hardware_concurrency()/2);
                if (plan.work_size > work_data_pp.size()) work_data_pp.resize(plan.work_size);
                plan.work_data = work_data_pp.data();
                auto status = ggml_graph_compute(graph_pp, &plan);
                if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute attn_kv_b");
                ggml_graph_clear(graph_pp);

                // Per-rank upload: f_q is the full wk_b_pp in [kv_lora_rank, qk_nope,
                // n_head] order. Per-head stride matches per_head_pp_bytes by construction.
                for (int id = 0; id < n_device; ++id) {
                    if (!wo_split->splits[id] || !wo_split->splits[id]->buffer) continue;
                    if (!wk_b_split->splits[id]) continue;
                    const int n_head_local = (int)wk_b_split->splits[id]->ne[2];
                    if (n_head_local <= 0) continue;

                    const size_t slice_bytes = (size_t)n_head_local * per_head_pp_bytes;
                    auto dev_buft = ggml_backend_buffer_get_type(wo_split->splits[id]->buffer);
                    auto dev_buf  = ggml_backend_buft_alloc_buffer(dev_buft, slice_bytes);
                    if (!dev_buf) {
                        throw std::runtime_error("Failed to allocate per-rank buffer for " + name);
                    }
                    ggml_backend_buffer_set_usage(dev_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
                    model.bufs.push_back(dev_buf);
                    // Same mem_used note as Path A: distribute_mla_tensors has already
                    // closed its books before llm_prepare_mla runs.

                    l.computed_wk_b_pp_replicas[id] = std::make_unique<ggml_tensor>(wk_b_pp_template);
                    auto rep = l.computed_wk_b_pp_replicas[id].get();
                    rep->ne[2] = n_head_local;
                    rep->nb[3] = rep->nb[2] * (size_t)rep->ne[2];
                    rep->buffer = dev_buf;
                    rep->data   = ggml_backend_buffer_get_base(dev_buf);
                    rep->op = GGML_OP_NONE;
                    for (int j = 0; j < GGML_MAX_SRC; ++j) rep->src[j] = nullptr;
                    rep->view_src = nullptr;
                    rep->view_offs = 0;
                    rep->extra = nullptr;
                    ggml_set_name(rep, (name + "." + std::to_string(id)).c_str());

                    const size_t byte_offset = (size_t)head_offsets[id] * per_head_pp_bytes;
                    ggml_backend_tensor_set(rep, (char *)f_q->data + byte_offset, 0, slice_bytes);
                    if (ggml_backend_buffer_is_host(rep->buffer)) {
                        iqk_modify_tensor(rep);
                    }
                    l.split_wk_b_pp.tensor_splits[id] = rep;
                }

                l.split_wk_b_pp.ggml.n_device  = n_device;
                l.split_wk_b_pp.ggml.split_dim = 2;
                l.split_wk_b_pp.ggml.splits    = l.split_wk_b_pp.tensor_splits.data();
                l.computed_wk_b_pp->extra = (void *)&l.split_wk_b_pp.ggml;
                model.tensors_by_name.push_back(std::make_pair(name, l.computed_wk_b_pp.get()));
                l.wk_b_pp = l.computed_wk_b_pp.get();

                LLAMA_LOG_INFO("Computed %s as %d x %d x %d of type %s, split across %d devices on dim=2\n",
                        name.c_str(),
                        (int)l.computed_wk_b_pp->ne[0],
                        (int)l.computed_wk_b_pp->ne[1],
                        (int)l.computed_wk_b_pp->ne[2],
                        ggml_type_name(l.computed_wk_b_pp->type), n_device);
            }
            ggml_free(ctx_pp);
        }
    }

    if (mla == 1 || (model.split_mode == LLAMA_SPLIT_MODE_GRAPH && n_computed == n_layer)) return;

    n_to_compute = 0;
    for (auto& l : model.layers) {
        if (l.wk_b && l.wv_b && !l.wkv_b) ++n_to_compute;
    }
    if (n_to_compute == 0) return;

    //
    // Prepare wkv_b tensors to enable MLA=2,3 usage also for model files that have been
    // crippled to the mainline llama.cpp MLA implementation (MLA=1 here).
    // We do it here because otherwise wk_b and wv_b may get run-time-repacked, which will make
    // preparation of wkv_b impossible. It also has the benefit that wkv_b will get automatically
    // run-time repacked if the rtr option is set.
    //
    const int32_t n_head        = hparams.n_head(0);
    std::vector<uint8_t> work_data;
    LLAMA_LOG_INFO("============ %s: need to compute %d wkv_b tensors\n", __func__, n_to_compute);
    for (int il = 1; il < n_layer; ++il) {
        // Somehow the number of heads is being defined as being per layer. Not sure why this is the
        // case, but for now we do not support strange models that have different numbers of heads
        // in different model layers.
        if ((int)hparams.n_head(il) != n_head) throw std::runtime_error("Unsupported configuration");
    }

    size_t context_size = ggml_tensor_overhead()*16*n_layer;

    ggml_init_params params{context_size, nullptr, true};
    auto ctx = ggml_init(params);
    auto graph = ggml_new_graph_custom(ctx, 8, false);

    std::vector<char> wk_buffer, wv_buffer;
    std::vector<char> tmp_buffer;
    for (int il = 0; il < n_layer; ++il) {
        auto& l = model.layers[il];
        if (l.wkv_b || !l.wk_b || !l.wv_b || (l.wo && l.wo->extra)) continue;
        auto wk_b = *l.wk_b;
        auto wv_b = *l.wv_b;
        if (!ggml_backend_buffer_is_host(l.wk_b->buffer)) {
            auto nbytes = ggml_nbytes(l.wk_b);
            if (wk_buffer.size() < nbytes) wk_buffer.resize(nbytes);
            ggml_backend_tensor_get(l.wk_b, wk_buffer.data(), 0, nbytes);
            wk_b.data = wk_buffer.data();
        }
        if (!ggml_backend_buffer_is_host(l.wv_b->buffer)) {
            auto nbytes = ggml_nbytes(l.wv_b);
            if (wv_buffer.size() < nbytes) wv_buffer.resize(nbytes);
            ggml_backend_tensor_get(l.wv_b, wv_buffer.data(), 0, nbytes);
            wv_b.data = wv_buffer.data();
        }

        auto n_wk = ggml_nelements(&wk_b);
        auto n_wv = ggml_nelements(&wv_b);

        size_t tot_size = 0;
        if (wk_b.type != GGML_TYPE_F32) {
            tot_size += n_wk*sizeof(float);
        }
        tot_size += n_wk*sizeof(float); // ggml_cont(ctx, ggml_transpose(ctx, wk_b_used));
        if (wv_b.type != GGML_TYPE_F32) {
            tot_size += n_wv*sizeof(float);
        }
        tot_size += (n_wk + n_wv)*sizeof(float); // ggml_concat(ctx, wk_b_transposed, wv_b_used, 0);
        tot_size += (n_wk + n_wv)*sizeof(float); // ggml_cast(ctx, wkv_b_f32, new_type);

        if (tmp_buffer.size() < tot_size) tmp_buffer.resize(tot_size);

        auto ptr = tmp_buffer.data();

        auto wk_b_used = &wk_b;
        if (wk_b.type != GGML_TYPE_F32) {
            wk_b_used = ggml_cast(ctx, &wk_b, GGML_TYPE_F32);
            wk_b_used->data = ptr;
            ptr += ggml_nbytes(wk_b_used);
        }
        auto wk_b_transposed = ggml_cont(ctx, ggml_transpose(ctx, wk_b_used));
        wk_b_transposed->data = ptr;
        ptr += ggml_nbytes(wk_b_transposed);

        auto wv_b_used = &wv_b;
        if (wv_b.type != GGML_TYPE_F32) {
            wv_b_used = ggml_cast(ctx, &wv_b, GGML_TYPE_F32);
            wv_b_used->data = ptr;
            ptr += ggml_nbytes(wv_b_used);
        }

        auto wkv_b_f32_3d = ggml_concat(ctx, wk_b_transposed, wv_b_used, 1);
        wkv_b_f32_3d->data = ptr;
        ptr += ggml_nbytes(wkv_b_f32_3d);

        auto wkv_b_f32 = ggml_view_2d(ctx, wkv_b_f32_3d, wkv_b_f32_3d->ne[0], wkv_b_f32_3d->ne[1]*wkv_b_f32_3d->ne[2],
                wkv_b_f32_3d->nb[1], 0);

        auto new_type = wk_b.type == GGML_TYPE_BF16 && wv_b.type == GGML_TYPE_BF16 ? GGML_TYPE_BF16
                      : wk_b.type == GGML_TYPE_F16  && wv_b.type == GGML_TYPE_F16  ? GGML_TYPE_F16
                      : GGML_TYPE_Q8_0;

        auto wkv_b = ggml_cast(ctx, wkv_b_f32, new_type);
        wkv_b->data = ptr;
        ptr += ggml_nbytes(wkv_b);

        ggml_build_forward_expand(graph, wkv_b);

        auto plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
        if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
        plan.work_data = work_data.data();

        auto status = ggml_graph_compute(graph, &plan);
        if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute wkv_b");

        auto name = std::string{"blk."} + std::to_string(il) + ".attn_kv_b.weight";

        l.computed_wkv_b = std::make_unique<ggml_tensor>(*wkv_b);
        l.computed_wkv_b->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(l.wk_b->buffer), ggml_nbytes(wkv_b));
        l.computed_wkv_b->data   = ggml_backend_buffer_get_base(l.computed_wkv_b->buffer);
        l.computed_wkv_b->op = GGML_OP_NONE; // we absolutely need to do this, else the backend will attempt to find the parents
                                             // of wkv_b, which no longer exist, and will therefore crash.
        for (int j = 0; j < GGML_MAX_SRC; ++j) l.computed_wkv_b->src[j] = nullptr;
        ggml_set_name(l.computed_wkv_b.get(), name.c_str());
        ggml_backend_buffer_set_usage(l.computed_wkv_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        ggml_backend_tensor_set(l.computed_wkv_b.get(), wkv_b->data, 0, ggml_nbytes(wkv_b));
        if (ggml_backend_buffer_is_host(l.computed_wkv_b->buffer)) {
            iqk_modify_tensor(l.computed_wkv_b.get());
        }

        l.wkv_b = l.computed_wkv_b.get();
        model.tensors_by_name.push_back(std::make_pair(name, l.wkv_b));

        LLAMA_LOG_INFO("Computed %s as %d x %d of type %s and stored in buffer %s\n", name.c_str(), (int)wkv_b->ne[0], (int)wkv_b->ne[1],
                    ggml_type_name(wkv_b->type), ggml_backend_buffer_name(l.computed_wkv_b->buffer));

        ggml_graph_clear(graph);
    }
    ggml_free(ctx);
}

// Fold the 64-block Hadamard into wv_b/wk_b_pp at init; build_deepseek2.cpp then
// skips the runtime cache_nope un-Hadamard. Math identity by H^T H = I.
static void llm_apply_khad_pretransform(llama_model & model) {
    if (model.khad_pretransformed) return;
    if (!model.is_mla_model()) return;

    // High-enough bpw to survive one quant->F32->H->quant roundtrip within PPL noise.
    // Cliff is ~2.7 bpw: IQ3_XXS (3.06) sits at +0.05 noise edge; IQ2_XS (2.31) drifts +0.20.
    // Below-cliff and unmeasured types fall back to the runtime cache_nope path.
    auto castable = [](ggml_type t) {
        return t == GGML_TYPE_F32 || t == GGML_TYPE_F16 || t == GGML_TYPE_BF16 ||
               t == GGML_TYPE_Q4_0 || t == GGML_TYPE_Q4_1 ||
               t == GGML_TYPE_Q5_0 || t == GGML_TYPE_Q5_1 ||
               t == GGML_TYPE_Q6_0 || t == GGML_TYPE_Q8_0 ||
               t == GGML_TYPE_IQ4_NL ||
               t == GGML_TYPE_Q4_K  || t == GGML_TYPE_Q5_K  || t == GGML_TYPE_Q6_K ||
               t == GGML_TYPE_IQ4_K || t == GGML_TYPE_IQ5_K ||
               t == GGML_TYPE_IQ4_KS|| t == GGML_TYPE_IQ4_KSS||
               t == GGML_TYPE_IQ5_KS;
    };

    const auto & hparams = model.hparams;
    const int64_t kv_lora_rank = hparams.n_lora_kv;
    if (kv_lora_rank <= 0 || kv_lora_rank % 64 != 0) return;

    // All-or-nothing: a partially-folded model would consume H-applied cache
    // through un-folded weights and produce wrong values.
    bool all_eligible = true;
    int n_layers_with_pp = 0;
    for (auto & l : model.layers) {
        if (!l.wv_b || !l.wk_b_pp) continue;
        ++n_layers_with_pp;
        ggml_type tv = l.wv_b->type;
        ggml_type tk = l.wk_b_pp->type;
        if (!castable(tv) || !castable(tk)) {
            all_eligible = false;
            break;
        }
    }
    if (!all_eligible || n_layers_with_pp == 0) {
        LLAMA_LOG_INFO("============ %s: skipping (no eligible wv_b/wk_b_pp; n_pp=%d)\n",
                       __func__, n_layers_with_pp);
        return;
    }

    auto fold_tensor = [&](ggml_tensor * t) -> bool {
        if (!t) return true;
        if (t->ne[0] != kv_lora_rank) return false;

        const size_t nbytes = ggml_nbytes(t);
        std::vector<uint8_t> host_in(nbytes);
        ggml_backend_tensor_get(t, host_in.data(), 0, nbytes);

        ggml_init_params ip{ ggml_tensor_overhead()*32 + ggml_graph_overhead(), nullptr, true };
        auto ctx = ggml_init(ip);
        auto graph = ggml_new_graph(ctx);

        ggml_tensor src_host = *t;
        src_host.data = host_in.data();
        src_host.op = GGML_OP_NONE;
        for (int j = 0; j < GGML_MAX_SRC; ++j) src_host.src[j] = nullptr;
        src_host.buffer = nullptr;
        src_host.extra = nullptr;
        src_host.view_src = nullptr;
        src_host.view_offs = 0;

        const size_t n_f32_bytes = (size_t)ggml_nelements(t) * sizeof(float);
        std::vector<uint8_t> f32_buf_a(n_f32_bytes);
        std::vector<uint8_t> f32_buf_b(n_f32_bytes);
        std::vector<uint8_t> out_buf(nbytes);

        auto src_f32 = ggml_cast(ctx, &src_host, GGML_TYPE_F32);
        src_f32->data = f32_buf_a.data();
        auto had     = ggml_hadamard(ctx, src_f32, 64);
        had->data    = f32_buf_b.data();
        auto out_q   = ggml_cast(ctx, had, t->type);
        out_q->data  = out_buf.data();

        ggml_build_forward_expand(graph, out_q);

        std::vector<uint8_t> work_data;
        auto plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
        if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
        plan.work_data = work_data.data();
        bool ok = (ggml_graph_compute(graph, &plan) == GGML_STATUS_SUCCESS);
        ggml_free(ctx);
        if (!ok) return false;

        ggml_backend_tensor_set(t, out_buf.data(), 0, nbytes);
        return true;
    };

    auto fold_split_or_single = [&](ggml_tensor * full) -> bool {
        if (!full) return true;
        if (full->extra) {
            auto split = (const ggml_split_tensor_t *)full->extra;
            for (int id = 0; id < split->n_device; ++id) {
                if (!split->splits[id]) continue;
                if (!fold_tensor(split->splits[id])) return false;
            }
            return true;
        }
        return fold_tensor(full);
    };

    int n_folded = 0;
    for (auto & l : model.layers) {
        if (!l.wv_b || !l.wk_b_pp) continue;
        if (!fold_split_or_single(l.wv_b))   { LLAMA_LOG_ERROR("%s: failed to fold wv_b\n",   __func__); return; }
        if (!fold_split_or_single(l.wk_b_pp)){ LLAMA_LOG_ERROR("%s: failed to fold wk_b_pp\n",__func__); return; }
        ++n_folded;
    }

    model.khad_pretransformed = (n_folded > 0);
    LLAMA_LOG_INFO("============ %s: folded H into wv_b/wk_b_pp on %d layers\n", __func__, n_folded);
}

static void llm_scale_gate_inp_s(llama_model & model, bool uses_mmap) {
    auto & hparams = model.hparams;
    LLAMA_LOG_INFO("%s: n_embd = %d\n", __func__, hparams.n_embd);
    std::vector<float> values(hparams.n_embd);
    float scale = 1.0f/sqrtf((float)hparams.n_embd);
    int n_host = 0;
    for (auto & l : model.layers) {
        auto gis = l.ffn_gate_inp_s;
        if (!gis) continue;
        bool is_host = ggml_backend_buffer_is_host(gis->buffer);
        if (is_host) {
            if (uses_mmap) {
                ++n_host;
            } else {
                auto val = (float *)gis->data;
                for (int j = 0; j < hparams.n_embd; ++j) val[j] *= scale;
            }
        } else {
            auto extra = (ggml_split_tensor_t *)gis->extra;
            if (extra) {
                ggml_backend_tensor_get(extra->splits[0], values.data(), 0, values.size()*sizeof(float));
            } else {
                ggml_backend_tensor_get(gis, values.data(), 0, values.size()*sizeof(float));
            }
            for (int j = 0; j < hparams.n_embd; ++j) values[j] *= scale;
            if (extra) {
                for (int id = 0; id < extra->n_device; ++id) {
                    ggml_backend_tensor_set(extra->splits[id], values.data(), 0, values.size()*sizeof(float));
                }
            } else {
                ggml_backend_tensor_set(gis, values.data(), 0, values.size()*sizeof(float));
            }
        }
    }
    if (n_host > 0) {
        model.aux_buffer.resize(n_host * hparams.n_embd * sizeof(float));
        auto ptr = model.aux_buffer.data();
        for (auto & l : model.layers) {
            auto gis = l.ffn_gate_inp_s;
            if (!gis) continue;
            if (ggml_backend_buffer_is_host(gis->buffer)) {
                auto val = (const float *)gis->data;
                for (int j = 0; j < hparams.n_embd; ++j) ptr[j] = val[j] * scale;
                gis->data = ptr;
                ptr += hparams.n_embd;
            }
        }
    }
}

// Backend (reg) enumeration
static bool striequals(const char* a, const char* b) {
    for (; *a && *b; a++, b++) {
        if (std::tolower(*a) != std::tolower(*b)) {
            return false;
        }
    }
    return *a == *b;
}

ggml_backend_t llama_context::ggml_backend_by_name(const char* name) {
    for (auto backend : backends) {
        const char* backend_name = ggml_backend_name(backend);
        if (striequals(backend_name, name)) {
            return backend;
        }
    }
    return nullptr;
}

static bool item_in_list(const std::vector<std::string>& devices, const char* name) {
    for (auto& device : devices) {
        if (striequals(device.c_str(), name)) {
            return true;
        }
    }
    return false;
}

static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t backend) {
    const char* name = ggml_backend_name(backend);
    if (ctx->cparams.devices.size()) {
        if (item_in_list(ctx->cparams.devices, name)) {
            ctx->backends.push_back(backend);
        }
    } else {
        ctx->backends.push_back(backend);
    }
}

static bool is_model_split_supported(const llama_model & model) {
    static std::unordered_set<llm_arch> k_supported = {
        LLM_ARCH_LLAMA,
        LLM_ARCH_QWEN3MOE,
        LLM_ARCH_GLM4_MOE,
        LLM_ARCH_MISTRAL3,
        LLM_ARCH_COMMAND_R,
        LLM_ARCH_COHERE2,
        LLM_ARCH_COHERE2_MOE,
        LLM_ARCH_MIMO2,
        LLM_ARCH_QWEN3,
        LLM_ARCH_QWEN3VL,
        LLM_ARCH_HUNYUAN_MOE,
        LLM_ARCH_OPENAI_MOE,
        LLM_ARCH_ERNIE4_5_MOE,
        LLM_ARCH_MINIMAX_M2,
        LLM_ARCH_SEED_OSS,
        LLM_ARCH_STEP35,
        LLM_ARCH_LAGUNA,
        //LLM_ARCH_QWEN3NEXT,
        LLM_ARCH_QWEN35,
        LLM_ARCH_QWEN35MOE,
        LLM_ARCH_GEMMA4,
        LLM_ARCH_DEEPSEEK2,
        LLM_ARCH_GLM_DSA,
        LLM_ARCH_MISTRAL4,
        LLM_ARCH_MELLUM,
        LLM_ARCH_LAGUNA,
    };
    auto it =  k_supported.find(model.arch);
    return it != k_supported.end();
}

struct expert_tensors {
    ggml_tensor * up = nullptr;
    ggml_tensor * gate = nullptr;
    ggml_tensor * down = nullptr;
};

static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_loader & ml, const llama_model & model,
        ggml_type cache_type_k, ggml_type cache_type_v, uint32_t max_ctx_size, int mla_attn, int n_seq_max, int n_ubatch,
        int amb, int worst_case_tokens, bool flash_attn,
        std::vector<expert_tensors> & experts) {
    int n_layer = model.hparams.n_layer;
    std::vector<double> result(n_layer+1, 0);
    std::vector<double> compute(n_layer+1, 0);
    struct mla_tensors {
        ggml_tensor * wk_b  = nullptr;
        ggml_tensor * wv_b  = nullptr;
        ggml_tensor * wkv_b = nullptr;
    };
    std::vector<mla_tensors> mla_tensors;
    bool has_mla = model.is_mla_model();
    if (has_mla) {
        mla_tensors.resize(n_layer);
    }
    experts.resize(n_layer);
    std::vector<size_t> ffn_exps(n_layer, 0), ffn_shexp(n_layer, 0), ffn_dense(n_layer, 0);
    std::vector<size_t> attn(n_layer, 0);
    std::vector<bool> has_layer_norm(n_layer, false);
    size_t ow_size = 0;
    size_t embd_size = 0;
    size_t output_misc_size = 0;
    int gemma4_rope_layer = -1;

    if (model.arch == LLM_ARCH_GEMMA4) {
        for (int il = 0; il < n_layer; ++il) {
            if (!model.hparams.swa_layers[il]) {
                gemma4_rope_layer = il;
                break;
            }
        }
    }

    for (int i = 0; i < ml.n_tensors; ++i) {
        auto t = ml.get_weight(i)->tensor;
        std::string name(t->name);
        auto size = ggml_nbytes(t);
        if (name == "token_embd.weight") {
            embd_size = size;
            continue;
        }
        if (name == "output.weight") {
            ow_size = size;
            continue;
        }
        if (name == "output_norm.weight") {
            continue;
        }
        if (model.arch == LLM_ARCH_GEMMA4) {
            if (name == "per_layer_token_embd.weight" ||
                name == "per_layer_model_proj.weight" ||
                name == "per_layer_proj_norm.weight") {
                output_misc_size += size;
                continue;
            }
            if (name == "rope_freqs.weight") {
                if (gemma4_rope_layer >= 0) {
                    result[gemma4_rope_layer] += size;
                }
                continue;
            }
        }
        if (name == "mtp_pre_proj.weight"  || name == "mtp_post_proj.weight" ||
            name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
            continue;
        }
        auto pos = name.find("blk.");
        if (pos != 0) {
            LLAMA_LOG_WARN("Oops: tensor with strange name %s\n", name.c_str());
            continue;
        }
        pos += 4;
        auto pos1 = name.find('.', pos);
        if (pos1 == std::string::npos) {
            LLAMA_LOG_WARN("Oops: tensor with strange name %s\n", name.c_str());
            continue;
        }
        auto layer_string = name.substr(pos, pos1-pos);
        std::istringstream str(layer_string);
        int il; str >> il;
        if (str.fail()) {
            LLAMA_LOG_WARN("Oops: failed to read layer index from %s for tensor %s\n", layer_string.c_str(), name.c_str());
        }
        if (il < 0 || il >= model.hparams.n_layer) {
            LLAMA_LOG_WARN("Oops: strange layer index %d for tensor %s\n", il, name.c_str());
            continue;
        }
        if (!model.mtp && model.hparams.nextn_predict_layers > 0 && il >= n_layer - model.hparams.nextn_predict_layers) {
            continue;
        }
        result[il] += size;
        if (auto pos = name.rfind(".bias"); pos < name.size() && name.size() - pos == 5) {
            // bias, we don't need to account for those
            continue;
        }
        if (auto pos = name.rfind(".scale"); pos < name.size() && name.size() - pos == 6) {
            // bias, we don't need to account for those
            continue;
        }
        bool is_mla = false;
        if (has_mla) {
            if (name.find("attn_k_b.weight") != std::string::npos) {
                mla_tensors[il].wk_b = t;
                is_mla = true;
            }
            else if (name.find("attn_v_b.weight") != std::string::npos) {
                mla_tensors[il].wv_b = t;
                is_mla = true;
            }
            else if (name.find("attn_kv_b.weight") != std::string::npos) {
                mla_tensors[il].wkv_b = t;
                is_mla = true;
            }
        }
        if (!is_mla) {
            auto ttype = llm_tensor_type(model.arch, name, il);
            if (ttype == LLM_TENSOR_UNKNOWN) {
                LLAMA_LOG_WARN("Oops: got unknown for tensor %s\n", name.c_str());
                continue;
            }
            if (ttype == LLM_TENSOR_FFN_GATE_UP_EXPS || ttype == LLM_TENSOR_FFN_GATE_EXPS || ttype == LLM_TENSOR_FFN_UP_EXPS || ttype == LLM_TENSOR_FFN_DOWN_EXPS) {
                auto size = t->ne[1] * n_ubatch * model.hparams.n_expert_used * sizeof(float);
                ffn_exps[il] += size;
                if (name.find(".bias") == std::string::npos) {
                    if (ttype == LLM_TENSOR_FFN_GATE_UP_EXPS || ttype == LLM_TENSOR_FFN_UP_EXPS) {
                        experts[il].up = t;
                    }
                    else if (ttype == LLM_TENSOR_FFN_GATE_EXPS) {
                        experts[il].gate = t;
                    }
                    else if (ttype == LLM_TENSOR_FFN_DOWN_EXPS) {
                        experts[il].down = t;
                    }
                }
            }
            else if (ttype == LLM_TENSOR_FFN_UP_SHEXP || ttype == LLM_TENSOR_FFN_GATE_SHEXP || ttype == LLM_TENSOR_FFN_DOWN_SHEXP) {
                ffn_shexp[il] += t->ne[1] * n_ubatch * sizeof(float);
            }
            else if (ttype == LLM_TENSOR_FFN_UP || ttype == LLM_TENSOR_FFN_GATE || ttype == LLM_TENSOR_FFN_DOWN) {
                ffn_dense[il] += t->ne[1] * n_ubatch * sizeof(float);
            }
            else if (ttype == LLM_TENSOR_ATTN_Q || ttype == LLM_TENSOR_ATTN_K || ttype == LLM_TENSOR_ATTN_V) {
                attn[il] += t->ne[1] * n_ubatch * sizeof(float);
            }
            else if (ttype == LLM_TENSOR_ATTN_OUT) {
                // This assume FA. With FA, we need to have at the same time Q, K, V and FA(Q, K, V).
                // FA(Q, K, V) has the dimension t->ne[0] x n_ubatch
                attn[il] += t->ne[0] * n_ubatch * sizeof(float);
            }
            else if (ttype == LLM_TENSOR_ATTN_NORM || ttype == LLM_TENSOR_ATTN_NORM_2 || ttype == LLM_TENSOR_ATTN_OUT_NORM || ttype == LLM_TENSOR_ATTN_POST_NORM ||
                     ttype == LLM_TENSOR_FFN_NORM  || ttype == LLM_TENSOR_FFN_POST_NORM || ttype == LLM_TENSOR_LAYER_OUT_NORM) {
                has_layer_norm[il] = true;
            }
            //else {
            //    printf("Unhandled tensor %s of type %d\n", name.c_str(), int(ttype));
            //}
        }
    }
    for (int il = 0; il < n_layer; ++il) {
        auto ffn = std::max(std::max(ffn_exps[il], ffn_shexp[il]), ffn_dense[il]);
        auto inp = model.hparams.n_embd * n_ubatch * sizeof(float);
        if (has_layer_norm[il]) inp *= 2;
        ffn += inp;
        compute[il] = std::max<double>(compute[il], ffn);
    }
    if (has_mla) {
        for (int il = 0; il < n_layer; ++il) {
            auto & mla = mla_tensors[il];
            if (mla.wk_b && mla.wv_b && !mla.wkv_b) {
                auto type = ggml_is_quantized(mla.wk_b->type) ? GGML_TYPE_Q8_0 : mla.wk_b->type;
                auto wkv_b_size = ggml_row_size(type, mla.wv_b->ne[0]) * (mla.wv_b->ne[1] + mla.wk_b->ne[1]) * mla.wv_b->ne[2] * 2;
                result[il] += wkv_b_size;
            }
            else if (mla.wkv_b) {
                if (!mla.wk_b) result[il] += ggml_nbytes(mla.wkv_b)/2;
                if (!mla.wv_b) result[il] += ggml_nbytes(mla.wkv_b)/2;
            }
            if (mla_attn == 3 && mla.wv_b) {
                auto kv_f32_size = (2 * mla.wv_b->ne[1] * mla.wv_b->ne[2] * max_ctx_size * sizeof(float))/(1024.*1024.);
                //printf("wv_b: %ld x %ld x %ld -> %g\n", mla.wv_b->ne[0], mla.wv_b->ne[1], mla.wv_b->ne[2], kv_f32_size);
                int n_head = mla.wv_b->ne[2];
                int n_max_head = n_head;
                if (amb > 0 && kv_f32_size > amb) {
                    n_max_head = 1;
                    for (int niter = 2; niter < n_head; ++niter) {
                        if (n_head % niter == 0 && kv_f32_size/niter <= amb) {
                            n_max_head = n_head/niter;
                            break;
                        }
                    }
                }
                kv_f32_size = 2. * mla.wv_b->ne[1] * n_max_head * max_ctx_size * sizeof(float);
                compute[il] = std::max(compute[il], kv_f32_size);
            }
        }
    }
    if (!ow_size) ow_size = embd_size;
    result[n_layer] = ow_size + output_misc_size;
    LLAMA_LOG_INFO("------------------- Layer sizes:\n");
    double tot_model = 0, tot_cache = 0, max_compute = 0;
    for (int il = 0; il < n_layer; ++il) {
        auto kv_size = model.cache_size(il, cache_type_k, cache_type_v, max_ctx_size, mla_attn, n_seq_max, flash_attn);
        LLAMA_LOG_INFO("Layer %2d: %9.2f, %9.2f, %9.2f   %9.2f  MiB\n", il, result[il]/1024./1024., kv_size/1024./1024., (result[il] + kv_size)/1024./1024., compute[il]/1024./1024.);
        max_compute = std::max(max_compute, compute[il]);
        tot_model += result[il];
        tot_cache += kv_size;
        result[il] += kv_size;
    }
    int n_output = worst_case_tokens > 0 ? worst_case_tokens : n_ubatch;
    size_t output_size = model.hparams.n_vocab * n_output * sizeof(float);
    if (output_size < max_compute) output_size = max_compute;
    output_size -= max_compute;
    LLAMA_LOG_INFO("Layer %2d: %9.2f, %9.2f, %9.2f MiB (output layer)\n", n_layer, result[n_layer]/1024./1024., output_size/1024./1024., (result[n_layer] + output_size)/1024./1024.);
    result[n_layer] += output_size;
    tot_cache += output_size;
    LLAMA_LOG_INFO("--------------------------------------------------------------------------\n");
    LLAMA_LOG_INFO("Total   : %9.2f, %9.2f, %9.2f MiB\n", tot_model/1024./1024., tot_cache/1024./1024., (tot_model + tot_cache)/1024./1024.);
    return std::make_pair(std::move(result), max_compute);
}

// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
        llama_model_loader & ml,
        llama_model & model,
        int n_gpu_layers,
        int & mla_attn,
        enum llama_split_mode split_mode,
        int main_gpu,
        int max_gpu,
        const float * tensor_split,
        ggml_type cache_type_k,
        ggml_type cache_type_v,
        ggml_type extra_output_type,
        uint32_t max_ctx_size,
        int n_seq_max,
        int n_ubatch,
        int amb,
        int fit_margin,
        const int * fit_margin_array,
        int worst_case_tokens,
        bool flash_attn,
        bool use_mlock,
        bool validate_quants,
        bool mtp,
        bool fit,
        bool dry_run,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    model.t_start_us = ggml_time_us();

    constexpr size_t k_default_mem_margin = 1024 * 1024 * 1024;

    auto & hparams = model.hparams;

    if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
        const bool unsupported_gemma_split =
            model.arch == LLM_ARCH_GEMMA4_MTP ||
            model.arch == LLM_ARCH_GEMMA4_ASSISTANT ||
            (model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);

        if (unsupported_gemma_split) {
            LLAMA_LOG_WARN("\n=========================================================\n");
            LLAMA_LOG_WARN("Split mode 'graph' is not supported for %s\n",
                    (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ? "Gemma 4 MTP assistant"
                                                      : "this Gemma4 variant");
            LLAMA_LOG_WARN("  => changing split mode to 'layer'\n");
            LLAMA_LOG_WARN("===========================================================\n\n");
            split_mode = LLAMA_SPLIT_MODE_LAYER;
        } else if (!is_model_split_supported(model)) {
            LLAMA_LOG_WARN("\n=======================================================\n");
            LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");
            LLAMA_LOG_WARN("  => changing split mode to 'layer'\n");
            LLAMA_LOG_WARN("=======================================================\n\n");
            split_mode = LLAMA_SPLIT_MODE_LAYER;
        } else {
            if (model.arch == LLM_ARCH_MIMO2 && model.devices.size() > 4 && (max_gpu == 0 || max_gpu > 4)) {
                LLAMA_LOG_WARN("\n================================================================\n");
                LLAMA_LOG_WARN("Split mode 'graph' for Mimo2 does not work with more than 4 GPUs\n");
                LLAMA_LOG_WARN("  => setting max_gpu to 4\n");
                LLAMA_LOG_WARN("================================================================\n\n");
                max_gpu = 4;
            }
        }
    }

    if (iqk_has_fancy_simd()) {
        LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n");
    } else {
        LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is NOT defined\n");
    }
    if (max_ctx_size == 0) {
        max_ctx_size = model.hparams.n_ctx_train;
    }

    if (fit) {
        if (ml.tensor_buft_overrides) {
            throw std::runtime_error("Manual tensor overrides cannot be used with --fit");
        }
        if (ml.ncmoe > 0) {
            throw std::runtime_error("-ncmoe | --n-cpu-moe cannot be used with --fit");
        }
        n_gpu_layers = 999;
    }

    model.split_mode   = split_mode;
    model.main_gpu     = main_gpu;
    model.max_gpu      = max_gpu;
    model.n_gpu_layers = n_gpu_layers;
    model.mtp          = mtp;

    size_t mem_margin  = fit_margin > 0 ? size_t(fit_margin)*1024*1024 : k_default_mem_margin;
    auto get_mem_margin = [mem_margin, fit_margin_array, n_gpu = int(model.devices.size()), func = __func__] (int gpu) {
        size_t result = mem_margin;
        bool have_margin_override = false;
        if (fit_margin_array) {
            for (int i = 0; ; i += 2) {
                if (fit_margin_array[i] < 0) break;
                if (fit_margin_array[i] == gpu && fit_margin_array[i+1] >= 0) {
                    result = size_t(fit_margin_array[i+1])*1024*1024;
                    have_margin_override = true;
                }
            }
        }
        if (have_margin_override) {
            LLAMA_LOG_INFO("%s: using %0.2f MiB as fit margin for GPU %d\n", func, result/1024./1024., gpu);
        }
        return result;
    };

    const int n_layer     = hparams.n_layer;
    int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
    bool use_mmap_buffer = true;

    // there is very little benefit to offloading the input layer, so always keep it on the CPU
    model.buft_input = llama_default_buffer_type_cpu(true);

    model.buft_layer.resize(n_layer);

    // assign cpu layers
    for (int i = 0; i < i_gpu_start; ++i) {
        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
    }

    std::vector<size_t> device_mem(model.devices.size());
    for (int i = 0; i < int(device_mem.size()); ++i) {
        device_mem[i] = llama_get_device_memory(model, model.devices[i]);
        auto this_margin = get_mem_margin(i);
        if (device_mem[i] > this_margin) {
            device_mem[i] -= this_margin;
        } else {
            LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the %zu MiB safety margin\n", device_mem[i]/(1024*1024), model.devices[i], this_margin/(1024*1024));
            device_mem[i] = 0;
        }
    }

    if (int device_count = model.devices.size(); device_count > 1) {
        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
        std::vector<float> splits(device_count);
        if (all_zero) {
            // default split, by free memory
            for (int i = 0; i < device_count; ++i) {
                splits[i] = device_mem[i];
            }
        } else {
            std::copy(tensor_split, tensor_split + device_count, splits.begin());
        }

        // sum and normalize the splits to get the split points
        float split_sum = 0.0f;
        for (int i = 0; i < device_count; ++i) {
            split_sum += splits[i];
            splits[i] = split_sum;
        }
        for (int i = 0; i < device_count; ++i) {
            splits[i] /= split_sum;
        }
        model.splits = std::move(splits);
    } else {
        model.splits = { 1.0f };
    }

    int device_count = model.splits.size();
    if (device_count < 2 && (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH)) {
        throw std::runtime_error("It is not possible to use split mode 'graph' with less than 2 devices");
    }
    if (fit && device_count > 1) {
        model.main_gpu = device_count - 1;
    }

    if (model.is_mla_model()) {
        if (model.n_gpu_layers > 0 && model.n_gpu_layers < model.hparams.n_layer && mla_attn != 3) {
            LLAMA_LOG_WARN("=============================================================================\n");
            LLAMA_LOG_WARN("MLA models with ngl < n_layer and split mode graph do not work with mla = %d\n", mla_attn);
            LLAMA_LOG_WARN("    => changing mla to 3\n");
            LLAMA_LOG_WARN("=============================================================================\n");
            mla_attn = 3;
        }
    }

    model.default_layer_device = std::vector<int32_t>(hparams.n_layer+1, device_count-1);
    int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
    std::vector<llama_model_tensor_buft_override> overrides;
    if (device_count > 0) {
        std::vector<expert_tensors> experts;
        auto [layer_sizes, max_compute] = get_layer_sizes(ml, model, cache_type_k, cache_type_v, max_ctx_size, mla_attn, n_seq_max, n_ubatch,
                amb, worst_case_tokens, flash_attn, experts);
        size_t required_mem = 0;
        for (int i = 0; i <= n_layer; ++i) {
            required_mem += layer_sizes[i];
        }
        bool has_experts = false;
        for (int il = 0; il < n_layer; ++il) {
            if (experts[il].down || experts[il].up || experts[il].gate) {
                has_experts = true;
                break;
            }
        }
        size_t available_mem = 0;
        for (int id = 0; id < device_count; ++id) {
            if (device_mem[id] > max_compute) {
                available_mem += device_mem[id] - max_compute;
            } else {
                LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the required compute buffer size %g MiB\n", device_mem[id]/(1024*1024), id, max_compute/(1024*1024));
            }
        }
        LLAMA_LOG_INFO("Memory required for model tensors + cache: %.f MiB\n", required_mem/(1024.*1024.));
        LLAMA_LOG_INFO("Memory available on all devices - compute: %.f MiB\n", available_mem/(1024.*1024.));
        // Do not adjust the splits if the use has provided their own
        if (required_mem > available_mem && !tensor_split) {
            float sum = 0;
            for (int id = 0; id < device_count; ++id) {
                device_mem[id] = device_mem[id] > max_compute ? device_mem[id] - max_compute : 0;
                sum += device_mem[id];
                model.splits[id] = sum;
            }
            if (sum > 0) {
                for (int id = 0; id < device_count; ++id) {
                    model.splits[id] /= sum;
                }
            }
        }
        if (device_count > 1) {
            int n_last = n_layer;
            if (n_gpu_layers > n_layer) ++n_last;
            double sum = max_compute * device_count;
            for (int i = i_gpu_start; i < n_last; ++i) sum += layer_sizes[i];
            int last = i_gpu_start;
            float loaded_sum = 0;
            for (int id = 0; id < device_count; ++id) {
                loaded_sum += max_compute;
                float split_size = model.splits[id]*sum;
                int il = last;
                for (; il < n_last; ++il) {
                    if (loaded_sum + layer_sizes[il] <= split_size) {
                        model.default_layer_device[il] = id;
                        loaded_sum += layer_sizes[il];
                        if (required_mem <= available_mem) {
                            LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
                        }
                    } else {
                        if (loaded_sum + layer_sizes[il] - split_size < split_size - loaded_sum) {
                            if (required_mem <= available_mem) {
                                LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
                            }
                            model.default_layer_device[il] = id;
                            loaded_sum += layer_sizes[il++];
                        }
                        break;
                    }
                }
                last = il;
            }
        }
        if (fit && required_mem > available_mem) {
            auto buft = ggml_backend_cpu_buffer_type();
            if (has_experts) {
            if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
                auto cur_mem = required_mem;
                int n_override = 0;
                for (int il = 0; il < n_layer; ++il) {
                    bool has_experts = false;
                    if (experts[il].down) {
                        cur_mem -= ggml_nbytes(experts[il].down);
                        has_experts = true;
                    }
                    if (experts[il].up) {
                        cur_mem -= ggml_nbytes(experts[il].up);
                        has_experts = true;
                    }
                    if (experts[il].gate) {
                        cur_mem -= ggml_nbytes(experts[il].gate);
                        has_experts = true;
                    }
                    if (has_experts) {
                        LLAMA_LOG_INFO("Adding experts CPU overrides for layer %d\n", il);
                        std::string pattern = "blk\\." + std::to_string(il) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.(weight|scale))";
                        auto & o = overrides.emplace_back();
                        o.pattern = strdup(pattern.c_str());
                        o.buft = buft;
                        ++n_override;
                    }
                    if (cur_mem <= available_mem - max_compute) {
                        LLAMA_LOG_INFO("Estimated memory use for split mode `graph' is %zu MiB after adding %d overrides, which is less than available memory of %zu MiB\n",
                                cur_mem/(1024*1024), n_override, size_t(available_mem - max_compute)/(1024*1024));
                        break;
                    }
                }
                if (cur_mem > available_mem) {
                    if (n_override > 0) {
                        LLAMA_LOG_ERROR("Required memory %zu MiB is still greater than available memory %zu MiB after overriding all MoE tensors to CPU\n",
                                cur_mem/(1024*1024), available_mem/(1024*1024));
                    } else {
                        LLAMA_LOG_ERROR("No MoE tensors in model and required memory %zu MiB greater than available memory %zu MiB.\n",
                                cur_mem/(1024*1024), available_mem/(1024*1024));
                        LLAMA_LOG_ERROR("Note: auto-fit currently only works for MoE models\n");
                    }
                    throw std::runtime_error("Unable to auto-fit model");
                }
                int id = model.default_layer_device[n_layer];
                if (device_mem[id] <= layer_sizes[n_layer]) {
                    LLAMA_LOG_ERROR("Not enough memory in device %d to offload the output layer\n", id);
                    throw std::runtime_error("Unable to auto-fit model");
                }
                device_mem[id] -= layer_sizes[n_layer];
                if (!tensor_split) {
                    float sum = 0;
                    for (int id = 0; id < int(model.splits.size()); ++id) {
                        sum += device_mem[id];
                        model.splits[id] = sum;
                    }
                    if (sum > 0) {
                        LLAMA_LOG_INFO("=== Adjusted device splits for split mode graph:\n");
                        float last_split = 0;
                        for (int id = 0; id < int(model.splits.size()); ++id) {
                            model.splits[id] /= sum;
                            LLAMA_LOG_INFO("Device %2d: %zu MiB -> split = %g\n", id, device_mem[id]/(1024*1024), model.splits[id] - last_split);
                            last_split = model.splits[id];
                        }
                    } else {
                        throw std::runtime_error("Unable to auto-fit model");
                    }
                }
            } else {
                for (int id = 0; id < device_count; ++id) {
                    size_t mem = 0;
                    for (int il = 0; il <= n_layer; ++il) {
                        if (model.default_layer_device[il] == id) {
                            mem += layer_sizes[il];
                        }
                    }
                    size_t cur_mem = mem;
                    int n_override = 0;
                    if (cur_mem > device_mem[id]) {
                        for (int il = n_layer-1; il >= 0; --il) {
                            if (model.default_layer_device[il] != id) continue;
                            bool has_experts = false;
                            if (experts[il].down) {
                                cur_mem -= ggml_nbytes(experts[il].down);
                                has_experts = true;
                            }
                            if (experts[il].up) {
                                cur_mem -= ggml_nbytes(experts[il].up);
                                has_experts = true;
                            }
                            if (experts[il].gate) {
                                cur_mem -= ggml_nbytes(experts[il].gate);
                                has_experts = true;
                            }
                            if (has_experts) {
                                LLAMA_LOG_INFO("Adding experts CPU overrides for layer %d in device %d\n", il, id);
                                std::string pattern = "blk\\." + std::to_string(il) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.(weight|scale))";
                                auto & o = overrides.emplace_back();
                                o.pattern = strdup(pattern.c_str());
                                o.buft = buft;
                                ++n_override;
                            }
                            if (cur_mem <= device_mem[id]) {
                                LLAMA_LOG_INFO("Memory use in device %d is %zu MiB after adding %d overrides, which is less than available memory of %zu MiB\n",
                                        id, cur_mem/(1024*1024) ,n_override, device_mem[id]/(1024*1024));
                                break;
                            }
                        }
                    }
                    if (cur_mem > device_mem[id]) {
                        if (n_override > 0) {
                            LLAMA_LOG_ERROR("Required memory %zu MiB in device %d is still greater than available memory %zu MiB after overriding all MoE tensors to CPU\n",
                                    cur_mem/(1024*1024), id, device_mem[id]/(1024*1024));
                        } else {
                            LLAMA_LOG_ERROR("No MoE tensors in layers assigned to device %d, and required memory %zu MiB greater than available memory %zu MiB.\n",
                                    id, cur_mem/(1024*1024), available_mem/(1024*1024));
                        }
                        throw std::runtime_error("Unable to auto-fit model");
                    }
                }
            }
            } else {
                int id = model.devices.size() - 1;
                for (int il = n_layer; il >= 0; --il) {
                    if (device_mem[id] >= layer_sizes[il]) {
                        device_mem[id] -= layer_sizes[il];
                        model.default_layer_device[il] = id;
                        LLAMA_LOG_INFO("Setting layer %d to device %d\n", il, id);
                    } else {
                        --id;
                        if (id >= 0) {
                            ++il;
                        } else {
                            i_gpu_start = il+1;
                            break;
                        }
                    }
                }
                for (int il = 0; il < i_gpu_start; ++il) {
                    model.default_layer_device[il] = -1;
                    model.buft_layer[il] = llama_default_buffer_type_cpu(true);
                }
            }
        }
    }
    if ((model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && split_mode == LLAMA_SPLIT_MODE_LAYER && device_count > 0 && n_gpu_layers > 0) {
        const int mtp_device = std::clamp(main_gpu, 0, device_count - 1);

        LLAMA_LOG_INFO("%s: Gemma 4 MTP assistant forcing layer placement to GPU %d under layer split\n",
                __func__, mtp_device);

        for (int i = i_gpu_start; i < n_layer; ++i) {
            model.default_layer_device[i] = mtp_device;
        }
        if (n_gpu_layers > n_layer) {
            model.default_layer_device[n_layer] = mtp_device;
        }
    }

    // assign the repeating layers to the devices according to the splits
    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
        for (int i = i_gpu_start; i < n_layer; ++i) {
            model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[i]]);
        }
        // assign the output layer
        if (n_gpu_layers > n_layer) {
            model.buft_output = llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[n_layer]]);
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
        }
    } else {
        ggml_backend_buffer_type_t split_buft;
        if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
            split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu]);
            model.split_buft = split_buft;
        } else {
            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
            split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
        }
        // assign the repeating layers
        for (int i = i_gpu_start; i < n_layer; ++i) {
            auto buft_layer = llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[i]]);
            if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
                int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
                        float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
                model.buft_layer[i] = { split_buft, llama_default_buffer_type_offload(model, model.devices[layer_gpu]) };
                LLAMA_LOG_INFO("Layer %d: assigning buft_layer to GPU %d\n", i, layer_gpu);
            } else {
                model.buft_layer[i] = { split_buft, buft_layer };
            }
        }
        // assign the output layer
        if (n_gpu_layers > n_layer) {
            model.buft_output = {
                split_buft,
                llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[n_layer]])
            };
        } else if (n_gpu_layers > 0) {
            // Under partial -ngl with -sm graph, route output to the GPU buffer
            // to avoid synchronization issues between the GPU where the last
            // REDUCE op is performed and the CPU, where the fused RMS norm and
            // mul_mat with the output tensor would be performed if left on the CPU.
            int last_gpu_layer = (int)n_layer - 1;
            int dev = model.default_layer_device.empty() || last_gpu_layer < 0
                    ? 0 : std::max(0, model.default_layer_device[last_gpu_layer]);
            model.buft_output = llama_default_buffer_type_offload(model, dev);
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
        }
    }

    if (!overrides.empty()) {
        auto & last = overrides.emplace_back();
        last.pattern = nullptr;
        ml.tensor_buft_overrides = overrides.data();
        model.tensor_overrides = true;
    }

    auto cth = create_tensors_helper_interface::instance(ml, model);

    auto ctx_size = cth->get_ctx_size();
    auto & ctx_map = cth->get_ctx_map();

    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);

    if (hparams.n_expert > 0 && hparams.n_expert_used == 0) {
        throw std::runtime_error("model has expert layers but no expert layers are used");
    }

    use_mmap_buffer = cth->create_tensors();
    if (!use_mmap_buffer) {
        ml.use_mmap = false;
    }

    bool defer_expert_mmap = ml.should_defer_expert_mmaps();
    if (defer_expert_mmap && use_mlock) {
        LLAMA_LOG_WARN("%s: deferred expert loading disabled because mlock keeps mmap ranges resident\n", __func__);
        defer_expert_mmap = false;
    }
    if (defer_expert_mmap && (ml.check_tensors || validate_quants)) {
        LLAMA_LOG_WARN("%s: deferred expert loading disabled because tensor validation would fault expert pages eagerly\n", __func__);
        defer_expert_mmap = false;
    }

    ml.done_getting_tensors();

    // --dry-run skips MAP_POPULATE/WILLNEED — tensor data is never read.
    ml.init_mappings(!defer_expert_mmap && !dry_run, use_mlock ? &model.mlock_mmaps : nullptr, ml.use_thp);
    model.mappings.reserve(ml.mappings.size());

    // create the backend buffers
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
    ctx_bufs.reserve(ctx_map.size());

    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
    model.bufs.reserve(n_max_backend_buffer);

    for (auto & it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx              = it.second;

        llama_buf_map bufs;
        bufs.reserve(n_max_backend_buffer);

        // only the mmap region containing the tensors in the model is mapped to the backend buffer
        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
        if (ml.use_mmap && use_mmap_buffer && (buft == llama_default_buffer_type_cpu(true) || buft == ggml_backend_cpu_buffer_type())) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                void * addr = nullptr;
                size_t first, last;
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                if (first >= last) {
                    continue;
                }
                ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
                if (buf == nullptr) {
                    throw std::runtime_error("unable to allocate backend CPU buffer");
                }
                model.bufs.push_back(buf);
                bufs.emplace(idx, buf);
#ifdef GGML_USE_CUDA
                if (n_layer >= n_gpu_layers) {
                    ggml_backend_cuda_register_host_buffer(
                        ggml_backend_buffer_get_base(buf),
                        ggml_backend_buffer_get_size(buf));
                }
#endif
            }
        }
#ifdef GGML_USE_METAL
        else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                const size_t max_size = ggml_get_max_tensor_size(ctx);
                void * addr = nullptr;
                size_t first, last;
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                if (first >= last) {
                    continue;
                }
                ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                if (buf == nullptr) {
                    throw std::runtime_error("unable to allocate backend metal buffer");
                }
                model.bufs.push_back(buf);
                bufs.emplace(idx, buf);
            }
        }
#endif
        else {
            int ntensor = 0;
            for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                ++ntensor;
            }
            if (ntensor > 0) {
                ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
                if (buf == nullptr) {
                    LLAMA_LOG_ERROR("Failed to allocate buffer type %s\n", ggml_backend_buft_name(buft));
                    throw std::runtime_error("unable to allocate backend buffer");
                }
                model.bufs.push_back(buf);
                if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                    model.mlock_bufs.emplace_back(new llama_mlock);
                    auto & mlock_buf = model.mlock_bufs.back();
                    mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                    mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
                }
                for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                    bufs.emplace(idx, buf);
                }
            }
        }

        if (bufs.empty()) {
            LLAMA_LOG_WARN("No tensors in buffer type %s\n", ggml_backend_buft_name(buft));
            continue;
            //throw std::runtime_error("failed to allocate buffer (1)");
        }

        for (auto & buf : bufs) {
            // indicate that this buffer contains weights
            // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

        ctx_bufs.emplace_back(ctx, bufs);
    }

    if (llama_supports_gpu_offload()) {
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
        if (n_gpu_layers > (int) hparams.n_layer) {
            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
        }

        const int max_backend_supported_layers = hparams.n_layer + 1;
        const int max_offloadable_layers       = hparams.n_layer + 1;

        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
    }

    // print memory requirements
    for (ggml_backend_buffer_t buf : model.bufs) {
        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
    }

    // populate tensors_by_name
    for (ggml_context * ctx : model.ctxs) {
        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
        }
    }

    // load tensor data
    if (!dry_run) {
        for (auto & it : ctx_bufs) {
            ggml_context * ctx = it.first;
            auto & bufs = it.second;
            if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
                return false;
            }
        }

        if (defer_expert_mmap) {
            ml.drop_mmap_expert_pages();
        }
    }

    if (model.is_mla_model()) {
        // -sm graph/attn needs wk_b->extra populated; run prepare even under dry-run.
        const bool graph_mode = (model.split_mode == LLAMA_SPLIT_MODE_GRAPH ||
                                 model.split_mode == LLAMA_SPLIT_MODE_ATTN);
        if (!dry_run || graph_mode) {
            llm_prepare_mla(model, mla_attn);
        }
    }
    if (model.arch == LLM_ARCH_GEMMA4) {
        llm_scale_gate_inp_s(model, use_mmap_buffer);
    }
    if ((model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) && extra_output_type != GGML_TYPE_COUNT) {
        llm_requantize_output_tensor(model, extra_output_type);
    }

    if (use_mmap_buffer) {
        for (auto & mapping : ml.mappings) {
            model.mappings.emplace_back(std::move(mapping));
        }
    }

    if (!ml.use_mmap) {
        int n_modified = 0;
        for (auto& it : model.tensors_by_name) {
            if (ggml_backend_buffer_is_host(it.second->buffer)) {
                if (iqk_modify_tensor(it.second)) ++n_modified;
            }
        }
        if (n_modified > 0) LLAMA_LOG_INFO("============ Modified %d tensors\n", n_modified);
    }

    if (validate_quants) {
        int nbad = 0;
        for (auto& it : model.tensors_by_name) {
            if (ggml_backend_buffer_is_host(it.second->buffer)) {
                if (!iqk_validate_tensor(it.second)) ++nbad;
            }
        }
        if (nbad > 0) {
            LLAMA_LOG_ERROR("Found %d bad tensors in model\n", nbad);
            throw std::runtime_error("Bad tensors in model");
        }
    }

    if (defer_expert_mmap && !dry_run) {
        LLAMA_LOG_INFO("%s: dense parameters loaded in %.2fs (%.2f GiB), expert parameters deferred (%.2f GiB)\n",
                __func__,
                (ggml_time_us() - model.t_start_us) / 1000000.0,
                ml.expert_tensor_index.dense_bytes / 1024.0 / 1024.0 / 1024.0,
                ml.expert_tensor_index.deferred_bytes / 1024.0 / 1024.0 / 1024.0);
    }

    if (!ml.use_mmap && ml.repack_tensors) {
        int n_repacked = 0;
        for (auto& it : model.tensors_by_name) {
            if (ggml_backend_buffer_is_host(it.second->buffer)) {
                auto orig_type = it.second->type;
                if (it.second->view_src) continue;
                iqk_repack_tensor(it.second);
                if (it.second->type != orig_type) ++n_repacked;
            }
        }
        if (n_repacked > 0) LLAMA_LOG_INFO("============ Repacked %d tensors\n", n_repacked);
    }

    if (model.arch == LLM_ARCH_BITNET) {
        auto set_scale = [] (ggml_tensor * w, ggml_tensor * s) {
            if (!s) {
                float one = 1;
                std::memcpy(w->op_params, &one, sizeof(one));
                return;
            }
            float scale = 1;
            if (ggml_backend_buffer_is_host(s->buffer)) {
                scale = *(const float *)s->data;
            } else {
                ggml_backend_tensor_get(s, &scale, 0, sizeof(float));
            }
            std::memcpy(w->op_params, &scale, sizeof(scale));
        };
        for (auto& l : model.layers) {
            set_scale(l.ffn_up, l.ffn_up_scale);
            set_scale(l.ffn_gate, l.ffn_gate_scale);
            set_scale(l.ffn_down, l.ffn_down_scale);
            set_scale(l.wq, l.wq_scale);
            set_scale(l.wk, l.wk_scale);
            set_scale(l.wv, l.wv_scale);
            set_scale(l.wo, l.wo_scale);
        }
    }

    if (!overrides.empty()) {
        for (auto & o : overrides) {
            if (o.pattern) free(const_cast<char*>(o.pattern));
        }
        ml.tensor_buft_overrides = nullptr;
    }

    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;
    return true;
}

// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
    try {
        llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors,
                params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
                params.defer_experts,
                params.kv_overrides, params.tensor_buft_overrides);

        model.hparams.vocab_only = params.vocab_only;

        model.mtp = params.mtp;

        try {
            llm_load_arch(ml, model);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model architecture: " + std::string(e.what()));
        }
        try {
            llm_load_hparams(ml, model);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
        }
        if (params.defer_experts && params.use_mmap) {
#ifdef __linux__
            ml.build_expert_tensor_index(model.hparams);
#else
            LLAMA_LOG_WARN("%s: deferred expert loading is only supported on Linux; ignoring defer_experts\n", __func__);
#endif
        }
        try {
            LLM_KV kv(model.arch);
            model.vocab.load(ml, kv);
        } catch(const std::exception & e) {
            throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
        }

        llm_load_print_meta(ml, model);

        if (model.vocab.get_type() != LLAMA_VOCAB_TYPE_NONE &&
            model.hparams.n_vocab != model.vocab.n_tokens()) {
            throw std::runtime_error("vocab size mismatch");
        }

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

#ifdef GGML_USE_KOMPUTE
        if (params.n_gpu_layers > 0 && (
            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
            || !(
                model.ftype == LLAMA_FTYPE_ALL_F32 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
            )
        )) {
            // TODO(cebtenzzre): propagate this error outside of llama_model_load_from_file
            LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
            params.n_gpu_layers = 0;
        }
#endif

        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
            params.type_k, params.type_v, params.extra_output_type,
            params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, params.fit_margin_array,
            params.worst_graph_tokens, params.flash_attn,
            params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run,
            params.progress_callback, params.progress_callback_user_data
        )) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}

//
// llm_build
//

static void llama_set_k_shift(llama_context & lctx) {
    const int64_t kv_size = lctx.kv_self.size;

    assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));

    int32_t * data = (int32_t *) lctx.inp_K_shift->data;

    for (int i = 0; i < kv_size; ++i) {
        data[i] = lctx.kv_self.cells[i].delta;
    }
}

static void llama_set_s_copy(llama_context & lctx) {
    const int64_t kv_size = lctx.kv_self.size;

    assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));

    int32_t * data = (int32_t *) lctx.inp_s_copy->data;

    for (int i = 0; i < kv_size; ++i) {
        data[i] = lctx.kv_self.cells[i].src;
    }
}

static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
    // TODO move to hparams if a T5 variant appears that uses a different value
    const int64_t max_distance = 128;

    if (bidirectional) {
        n_buckets >>= 1;
    }

    const int64_t max_exact = n_buckets >> 1;

    int32_t relative_position = x - y;
    int32_t relative_bucket = 0;
    if (bidirectional) {
        relative_bucket += (relative_position > 0) * n_buckets;
        relative_position = abs(relative_position);
    } else {
        relative_position = -std::min<int32_t>(relative_position, 0);
    }
    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
    relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
    return relative_bucket;
}

static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
    //
    // set input data
    //

    const auto & hparams = lctx.model.hparams;
    const auto & cparams = lctx.cparams;
    const auto & kv_self = lctx.kv_self;

    if (batch.token && lctx.inp_tokens) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_tokens = batch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(token): %d us\n", int(tim2-tim1));
#endif
    }

    if (batch.embd) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_embd   = hparams.n_embd;
        const int64_t n_tokens = batch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(embd): %d us\n", int(tim2-tim1));
#endif
    }

    if (batch.pos && lctx.inp_pos) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_tokens = batch.n_tokens;
        const int n_pos_per_embd =  hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(lctx.inp_pos));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(pos): %d us\n", int(tim2-tim1));
#endif
    }

    if (lctx.inp_pos && lctx.inp_scale) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        int n_tokens = batch.n_tokens;
        GGML_ASSERT(ggml_nelements(lctx.inp_scale) >= n_tokens);
        if (int(lctx.scale_data.size()) < n_tokens) lctx.scale_data.resize(n_tokens);
        int n_pos_per_token = 1;
        for (int i = 0; i < n_tokens; ++i) {
            lctx.scale_data[i] = std::log(std::floor((batch.pos[i] + 1.0f) / hparams.n_attn_temp_floor_scale) + 1.0f) * hparams.f_attn_temp_scale + 1.0f;
        }
        ggml_backend_tensor_set(lctx.inp_scale, lctx.scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(lctx.inp_scale));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(scale): %d us\n", int(tim2-tim1));
#endif
    }

    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_tokens = batch.n_tokens;
        if (n_tokens > 1 && !cparams.mtp && lctx.n_outputs < n_tokens) {
            GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        }

        if (lctx.inp_out_ids && lctx.inp_out_ids->buffer) {
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
        int32_t * data = (int32_t *) lctx.inp_out_ids->data;

        if (lctx.n_outputs == n_tokens) {
            for (int i = 0; i < n_tokens; ++i) {
                data[i] = i;
            }
        } else if (batch.logits) {
            int32_t n_outputs = 0;
            for (int i = 0; i < n_tokens; ++i) {
                if (batch.logits[i]) {
                    data[n_outputs++] = i;
                }
            }
            // the graph needs to have been passed the correct number of outputs
            GGML_ASSERT(lctx.n_outputs == n_outputs);
        } else if (lctx.n_outputs == 1) {
            // only keep last output
            data[0] = n_tokens - 1;
        } else {
            GGML_ASSERT(lctx.n_outputs == 0);
        }
        }
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(outputs): %d us\n", int(tim2-tim1));
#endif
    }

    GGML_ASSERT(
        // (!a || b) is a logical implication (a -> b)
        // !hparams.causal_attn -> !cparams.causal_attn
        (hparams.causal_attn || !cparams.causal_attn) &&
        "causal attention is not supported by this model"
    );

    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
        if (cparams.causal_attn && !lctx.is_encoding) {
            const llama_kv_cache & mask_kv_self =
                ((lctx.model.arch == LLM_ARCH_GEMMA4_MTP || lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && lctx.mtp_target_ctx != nullptr)
                    ? lctx.mtp_target_ctx->kv_self
                    : kv_self;
            const int64_t n_kv     = mask_kv_self.n;
            const int64_t n_tokens = batch.n_tokens;


            float * data     = nullptr;
            float * data_swa = nullptr;
            ggml_half * data_f16     = nullptr;
            ggml_half * data_swa_f16 = nullptr;

            if (lctx.inp_KQ_mask) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
                if (cparams.flash_attn) {
                    data_f16 = (ggml_half *)lctx.inp_KQ_mask->data;
                } else {
                    data = (float *) lctx.inp_KQ_mask->data;
                }
            }

            if (lctx.inp_KQ_mask_swa) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                if (cparams.flash_attn) {
                    data_swa_f16 = (ggml_half *) lctx.inp_KQ_mask_swa->data;
                } else {
                    data_swa = (float *) lctx.inp_KQ_mask_swa->data;
                }
            }

            auto noalibi_f16 = [&mask_kv_self, &hparams, n_kv, data_f16, data_swa_f16] (int j, llama_pos pos, llama_seq_id seq_id, int first, int last) {
                ggml_half h_inf  = ggml_fp32_to_fp16(-INFINITY);
                ggml_half h_zero = ggml_fp32_to_fp16(0.f);
                for (int i = first; i < last; ++i) {
                    ggml_half h = !mask_kv_self.cells[i].has_seq_id(seq_id) || mask_kv_self.cells[i].pos > pos ? h_inf : h_zero;
                    if (data_f16) data_f16[j*n_kv + i] = h;
                    if (data_swa_f16) {
                        if (h != h_inf) {
                            if (hparams.n_attn_chunk) {
                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                if (mask_kv_self.cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
                                    h = h_inf;
                                }
                            } else {
                                if (pos - mask_kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    h = h_inf;
                                }
                            }
                        }
                        data_swa_f16[j*n_kv + i] = h;
                    }
                }
            };

            if (n_kv >= 1024 && n_tokens >= 32) {
                int n_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
                int npt = (n_kv + n_thread - 1)/n_thread;
                auto compute = [&batch, &mask_kv_self, &hparams, &cparams, &noalibi_f16, n_tokens, n_kv, npt, data, data_swa, data_f16, data_swa_f16] (int ith) {
                    int first = ith * npt;
                    int last  = std::min(int(n_kv), first + npt);
                    if (last <= first) return;
                    for (int j = 0; j < n_tokens; ++j) {
                        const llama_pos    pos    = batch.pos[j];
                        const llama_seq_id seq_id = batch.seq_id[j][0];

                        if (!hparams.use_alibi && cparams.flash_attn) {
                            noalibi_f16(j, pos, seq_id, first, last);
                            continue;
                        }

                        for (int i = first; i < last; ++i) {
                            float f;
                            if (!mask_kv_self.cells[i].has_seq_id(seq_id) || mask_kv_self.cells[i].pos > pos) {
                                f = -INFINITY;
                            } else {
                                if (hparams.use_alibi) {
                                    f = -std::abs(mask_kv_self.cells[i].pos - pos);
                                } else {
                                    f = 0.0f;
                                }
                            }

                            if (data) {
                                data[j*n_kv + i] = f;
                            }
                            if (data_f16) {
                                data_f16[j*n_kv + i] = ggml_fp32_to_fp16(f);
                            }

                            // may need to cut off old tokens for sliding window
                            if (data_swa || data_swa_f16) {
                                if (f > -INFINITY) {
                                    if (hparams.n_attn_chunk) {
                                        llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                        if (mask_kv_self.cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
                                            f = -INFINITY;
                                        }
                                    } else {
                                        if (pos - mask_kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                            f = -INFINITY;
                                        }
                                    }
                                }
                                if (data_swa) {
                                    data_swa[j*n_kv + i] = f;
                                }
                                if (data_swa_f16) {
                                    data_swa_f16[j*n_kv + i] = ggml_fp32_to_fp16(f);
                                }
                            }
                        }
                    }
                };
                std::vector<std::thread> workers(n_thread-1);
                int it = 0;
                for (auto& w : workers) w = std::thread(compute, it++);
                compute(it);
                for (auto& w : workers) w.join();
                int64_t n_tokens_padded = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
                if (n_tokens_padded > n_tokens) {
                    if (data) {
                        std::fill(data + int64_t(n_tokens)*n_kv, data + n_tokens_padded*n_kv, -INFINITY);
                    }
                    if (data_f16) {
                        ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                        std::fill(data_f16 + int64_t(n_tokens)*n_kv, data_f16 + n_tokens_padded*n_kv, h_inf);
                    }
                    if (data_swa) {
                        std::fill(data_swa + int64_t(n_tokens)*n_kv, data_swa + n_tokens_padded*n_kv, -INFINITY);
                    }
                    if (data_swa_f16) {
                        ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                        std::fill(data_swa_f16 + int64_t(n_tokens)*n_kv, data_swa_f16 + n_tokens_padded*n_kv, h_inf);
                    }
                }
            }
            else {

            // For causal attention, use only the previous KV cells
            // of the correct sequence for each token of the batch.
            // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    const llama_pos    pos    = batch.pos[j];
                    const llama_seq_id seq_id = batch.seq_id[j][0];

                    if (!hparams.use_alibi && cparams.flash_attn) {
                        noalibi_f16(j, pos, seq_id, 0, n_kv);
                        continue;
                    }

                    for (int i = 0; i < n_kv; ++i) {
                        float f;
                        if (!mask_kv_self.cells[i].has_seq_id(seq_id) || mask_kv_self.cells[i].pos > pos) {
                            f = -INFINITY;
                        } else {
                            if (hparams.use_alibi) {
                                f = -std::abs(mask_kv_self.cells[i].pos - pos);
                            } else {
                                f = 0.0f;
                            }
                        }

                        if (data) {
                            data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                        }
                        if (data_f16) {
                            data_f16[h*(n_kv*n_tokens) + j*n_kv + i] = ggml_fp32_to_fp16(f);
                        }

                        // may need to cut off old tokens for sliding window
                        if (data_swa || data_swa_f16) {
                            if (hparams.n_attn_chunk) {
                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                if (mask_kv_self.cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
                                    f = -INFINITY;
                                }
                            } else {
                                if (pos - mask_kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    f = -INFINITY;
                                }
                            }
                            if (data_swa) {
                                data_swa[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                            }
                            if (data_swa_f16) {
                                data_swa_f16[h*(n_kv*n_tokens) + j*n_kv + i] = ggml_fp32_to_fp16(f);
                            }
                        }
                    }
                }

                int64_t n_tokens_padded = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
                if (n_tokens_padded > n_tokens) {
                    if (data) {
                        std::fill(data + int64_t(n_tokens)*n_kv, data + n_tokens_padded*n_kv, -INFINITY);
                    }
                    if (data_f16) {
                        ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                        std::fill(data_f16 + int64_t(n_tokens)*n_kv, data_f16 + n_tokens_padded*n_kv, h_inf);
                    }
                    if (data_swa) {
                        std::fill(data_swa + int64_t(n_tokens)*n_kv, data_swa + n_tokens_padded*n_kv, -INFINITY);
                    }
                    if (data_swa_f16) {
                        ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                        std::fill(data_swa_f16 + int64_t(n_tokens)*n_kv, data_swa_f16 + n_tokens_padded*n_kv, h_inf);
                    }
                }
            }
            }
#if IK_PRINT_TIMING == 2
            auto tim2 = ggml_time_us();
            printf("set_inputs(mask1): %d us\n", int(tim2-tim1));
#endif
        } else {
            // when using kv cache, the mask needs to match the kv cache size
            const int64_t n_tokens = batch.n_tokens;
            const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));

            float * data     = nullptr;
            float * data_swa = nullptr;
            ggml_half * data_f16     = nullptr;
            ggml_half * data_swa_f16 = nullptr;

            if (lctx.inp_KQ_mask) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
                if (cparams.flash_attn) {
                    data_f16 = (ggml_half *)lctx.inp_KQ_mask->data;
                } else {
                    data = (float *) lctx.inp_KQ_mask->data;
                }
            }

            if (lctx.inp_KQ_mask_swa) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                if (cparams.flash_attn) {
                    data_swa_f16 = (ggml_half *) lctx.inp_KQ_mask_swa->data;
                } else {
                    data_swa = (float *) lctx.inp_KQ_mask_swa->data;
                }
            }

            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    const llama_seq_id seq_id = batch.seq_id[j][0];

                    for (int i = 0; i < n_tokens; ++i) {
                        float f = -INFINITY;
                        for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                            if (batch.seq_id[i][s] == seq_id) {
                                if (hparams.use_alibi) {
                                    f = -std::abs(batch.pos[i] - batch.pos[j]);
                                } else {
                                    f = 0.0f;
                                }
                                break;
                            }
                        }

                        if (data) {
                            data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
                        }
                        if (data_f16) {
                            data_f16[h*(n_tokens*n_tokens) + j*n_tokens + i] = ggml_fp32_to_fp16(f);
                        }

                        if (data_swa || data_swa_f16) {
                            if (hparams.n_attn_chunk) {
                                llama_pos pos_chunk_start = (batch.pos[i] / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                if (lctx.kv_self.cells[i].pos < pos_chunk_start || batch.pos[i] < pos_chunk_start) {
                                    f = -INFINITY;
                                }
                            } else {
                                if (batch.pos[i] - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    f = -INFINITY;
                                }
                            }
                            if (data_swa) {
                                data_swa[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
                            }
                            if (data_swa_f16) {
                                data_swa_f16[h*(n_tokens*n_tokens) + j*n_tokens + i] = ggml_fp32_to_fp16(f);
                            }
                        }
                    }

                    if (data) {
                        for (int i = n_tokens; i < n_stride; ++i) {
                            data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
                        }
                    }
                    if (data_f16) {
                        auto h_inf = ggml_fp32_to_fp16(-INFINITY);
                        for (int i = n_tokens; i < n_stride; ++i) {
                            data_f16[h*(n_tokens*n_tokens) + j*n_stride + i] = h_inf;
                        }
                    }
                }
            }
#if IK_PRINT_TIMING == 2
            auto tim2 = ggml_time_us();
            printf("set_inputs(mask2): %d us\n", int(tim2-tim1));
#endif
        }
    }

    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(lctx.inp_mean);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

        float * data = (float *) lctx.inp_mean->data;
        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

        std::vector<uint64_t> sum(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");

            sum[seq_id] += 1;
        }

        std::vector<float> div(n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            const uint64_t s = sum[i];
            if (s > 0) {
                div[i] = 1.0f/float(s);
            }
        }

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            data[seq_id*n_tokens + i] = div[seq_id];
        }
    }

    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            const llama_pos    pos    = batch.pos[i];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");

            if (pos == 0) {
                data[seq_id] = i;
            }
        }
    }

    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        std::vector<int> last_pos(n_tokens, -1);
        std::vector<int> last_row(n_tokens, -1);

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            const llama_pos    pos    = batch.pos[i];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");

            if (pos >= last_pos[seq_id]) {
                last_pos[seq_id] = pos;
                last_row[seq_id] = i;
            }
        }

        for (int i = 0; i < n_tokens; ++i) {
            if (last_row[i] >= 0) {
                data[i] = last_row[i];
            }
        }
    }

    if (kv_self.recurrent) {
        const int64_t n_kv = kv_self.n;

        if (lctx.inp_s_mask) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
            float * data = (float *) lctx.inp_s_mask->data;

            // states which are not affected by the current batch are left untouched
            for (int i = 0; i < n_kv; ++i) {
                llama_seq_id    seq_id       = i + lctx.kv_self.head;
                llama_kv_cell & kv_cell      = lctx.kv_self.cells[seq_id];
                bool            has_self_seq = kv_cell.has_seq_id(seq_id);

                data[i] = (float) has_self_seq;

                // ensure current sequences will be kept
                if (!has_self_seq && kv_cell.pos >= 0) {
                    kv_cell.seq_id.insert(seq_id);
                }
            }
        }
        // For Mamba (and other recurrent architectures),
        // update the correct state(s)/sequence(s) for each token of the batch.
        // Like with the KQ_mask, if a token in the batch has multiple sequences,
        // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv).
        if (lctx.inp_s_seq) {
            const int64_t n_tokens = batch.n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer));
            int32_t * data = (int32_t *) lctx.inp_s_seq->data;

            for (int j = 0; j < n_tokens; ++j) {
                const int32_t n_seq = batch.n_seq_id[j];
                GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence

                for (int i = 0; i < n_kv; ++i) {
                    if (i < n_seq) {
                        // for this type of model, the head is the minimum seq_id of the batch
                        data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head;
                    } else {
                        data[j*n_kv + i] = -1;
                    }
                }
            }
        }
    }

    if (lctx.inp_s_seq_qnext) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq_qnext->buffer));
        int32_t * data = (int32_t *) lctx.inp_s_seq_qnext->data;

        for (int64_t j = 0; j < n_tokens; ++j) {
            // qwen3next linear-attention path uses a single local recurrent state slot.
            data[j] = 0;
        }
    }

    if (lctx.inp_pos_bucket) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));

        int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;

        if (!lctx.is_encoding) {
            const int64_t n_kv = kv_self.n;
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_kv; ++i) {
                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        } else {
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_tokens; ++i) {
                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        }
    }

    if (!lctx.is_encoding && lctx.inp_embd_enc) {
        assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
        assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());

        ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
    }

    if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
        const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));

        float * data = (float *) lctx.inp_KQ_mask_cross->data;

        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                for (int i = 0; i < n_output_enc; ++i) {
                    float f = -INFINITY;
                    for (int s = 0; s < batch.n_seq_id[j]; ++s) {
                        const llama_seq_id seq_id = batch.seq_id[j][s];
                        if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
                            f = 0.0f;
                        }
                    }
                    data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
                }
            }

            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                for (int j = 0; j < n_output_enc; ++j) {
                    data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
                }
            }
        }
    }
}

// Make sure enough space is available for outputs.
// Returns max number of outputs for which space was reserved.
static uint32_t llama_output_embd_width(const llama_context & lctx) {
    return llama_mtp_state_n_embd(&lctx);
}

static bool llama_context_has_mtp_outputs(const llama_context & lctx) {
    return lctx.cparams.mtp && (
        lctx.model.hparams.nextn_predict_layers > 0 ||
        lctx.model.arch == LLM_ARCH_GEMMA4 ||
        lctx.model.arch == LLM_ARCH_GEMMA4_MTP ||
        lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT);
}

static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    const auto & cparams = lctx.cparams;
    const auto & hparams = lctx.model.hparams;

    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);

    const auto n_batch = cparams.n_batch;
    const auto n_vocab = hparams.n_vocab;
    const auto n_embd  = llama_output_embd_width(lctx);

    // TODO: use a per-batch flag for logits presence instead
    const bool has_mtp = llama_context_has_mtp_outputs(lctx);
    const bool has_logits = !cparams.embeddings || has_mtp;
    const bool has_embd   = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)) || has_mtp;

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;

    if (lctx.output_ids.empty()) {
        // init, never resized afterwards
        lctx.output_ids.resize(n_batch);
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
    const size_t new_size  = (logits_size + embd_size) * sizeof(float);

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
    if (!lctx.buf_output || prev_size < new_size) {
        if (lctx.buf_output) {
#ifndef NDEBUG
            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
            ggml_backend_buffer_free(lctx.buf_output);
            lctx.buf_output = nullptr;
            lctx.logits = nullptr;
            lctx.embd = nullptr;
        }

        lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
        if (lctx.buf_output == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
            return 0;
        }
    }

    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);

    lctx.logits = has_logits ? output_base               : nullptr;
    lctx.embd   = has_embd   ? output_base + logits_size : nullptr;

    lctx.output_size = n_outputs_max;
    lctx.logits_size = logits_size;
    lctx.embd_size   = embd_size;

    // set all ids as invalid (negative)
    std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);

    if (false) {
        // What is the purpose of clearing the output buffer?
        // When we are getting embeddings for models with large vocabularies this
        // costs a non-negligible amount of time.
        // The output buffer will get populated with meaningful results in llama_decode
        // If it doesn't, the solution is not to just blindly zero the buffer
        // but to fix the bug that causes meaningless results.
        if (has_mtp) {
            // MTP uses a large output footprint, clear only the active region.
            const size_t clear_size = (logits_size + embd_size) * sizeof(float);
            if (clear_size > 0 && output_base) {
                memset(output_base, 0, clear_size);
            }
        } else {
            ggml_backend_buffer_clear(lctx.buf_output, 0);
        }
    }

    lctx.n_outputs = 0;

    return n_outputs_max;
}


static void llama_graph_compute(
        llama_context & lctx,
          ggml_cgraph * gf,
                  int   n_threads) {
#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(lctx.backend_metal)) {
        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
    }
#endif

    if (lctx.backend_cpu != nullptr) {
        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
        ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
    }
#ifdef GGML_USE_BLAS
    if (lctx.backend_blas != nullptr) {
        ggml_backend_blas_set_n_threads(lctx.backend_blas, n_threads);
    }
#endif

    ggml_backend_sched_graph_compute_async(lctx.sched, gf);

    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
}

static bool prepare_mtp_graph_inputs(
        struct llama_context & lctx,
        uint32_t cur_token,
        uint32_t n_tokens,
        uint32_t n_tokens_all) {
    ggml_tensor * dst = lctx.inp_mtp_states;
    const size_t expected_floats = ggml_nbytes(dst) / sizeof(float);
    const size_t total_floats = lctx.draft_input_hidden_state_n_floats;
    const float * src = lctx.draft_input_hidden_state;

    if (!src) {
        LLAMA_LOG_ERROR("%s: Source hidden state is null\n", __func__);
        return false;
    }

    if (total_floats != expected_floats) {
        if (n_tokens_all == 0 || total_floats % n_tokens_all != 0) {
            LLAMA_LOG_ERROR("%s: Source hidden state size mismatch (have %zu floats, need %zu)\n",
                    __func__, total_floats, expected_floats);
            return false;
        }

        const size_t row_width = total_floats / n_tokens_all;
        const size_t slice_floats = row_width * n_tokens;
        const size_t slice_offset = (size_t) cur_token * row_width;

        if (slice_floats != expected_floats ||
                slice_offset > total_floats ||
                total_floats - slice_offset < expected_floats) {
            LLAMA_LOG_ERROR("%s: Source hidden state size mismatch (have %zu floats, need %zu)\n",
                    __func__, total_floats, expected_floats);
            return false;
        }

        src += slice_offset;
    }

    if (src == nullptr) {
        LLAMA_LOG_ERROR("%s: Source hidden state size mismatch (have %zu floats, need %zu)\n",
                __func__, total_floats, expected_floats);
        return false;
    }

    ggml_backend_tensor_set(dst, src, 0, ggml_nbytes(dst));
    return true;
}

// decode a batch of tokens by evaluating the transformer
//
//   - lctx:      llama context
//   - batch:     batch to evaluate
//
// return 0 on success
// return positive int on warning
// return negative int on error
//
static int llama_decode_internal(
         llama_context & lctx,
           llama_batch   batch_all) { // TODO: rename back to batch

    lctx.is_encoding = false;
    const uint32_t n_tokens_all = batch_all.n_tokens;

    if (n_tokens_all == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
        return -1;
    }
#if IK_PRINT_TIMING > 2
    printf("===== %s: %ld\n", __func__, ggml_time_us());
#endif

    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT

    GGML_ASSERT(n_tokens_all <= cparams.n_batch);

    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");

    if (lctx.t_compute_start_us == 0) {
        lctx.t_compute_start_us = ggml_time_us();
    }
    lctx.n_queued_tokens += n_tokens_all;

    auto & kv_self = lctx.kv_self;

    const int64_t n_embd  = hparams.n_embd;
    const int64_t n_vocab = hparams.n_vocab;

    uint32_t n_outputs = 0;
    uint32_t n_outputs_prev = 0;
    uint32_t n_outputs_embd = 0;
    uint32_t n_outputs_prev_embd = 0;

    const auto n_ubatch = cparams.n_ubatch;

    // TODO: simplify or deprecate
    std::vector<llama_pos> pos;
    std::vector<int32_t>                   n_seq_id;
    std::vector<llama_seq_id *>            seq_id_arr;
    std::vector<std::vector<llama_seq_id>> seq_id;

    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
    const bool has_mtp = llama_context_has_mtp_outputs(lctx);
    const uint32_t n_embd_output = llama_output_embd_width(lctx);

    // count outputs
    if (batch_all.logits && !embd_pooled) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            n_outputs += batch_all.logits[i] != 0;
        }
    } else if (lctx.logits_all || embd_pooled) {
        n_outputs = n_tokens_all;
    } else {
        // keep last output only
        n_outputs = 1;
    }

    // reserve output buffer
    n_outputs_embd = has_mtp && cparams.mtp_op_type == MTP_OP_NONE ? n_tokens_all : n_outputs;
    if (llama_output_reserve(lctx, std::max<size_t>(n_outputs, n_outputs_embd)) < std::max<size_t>(n_outputs, n_outputs_embd)) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %zu outputs\n", __func__, std::max<size_t>(n_outputs, n_outputs_embd));
        return -2;
    };

    // set output mappings
    if (batch_all.logits) {
        int32_t i_logits = 0;
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            if (batch_all.logits[i]) {
                lctx.output_ids[i] = i_logits++;
            } else {
                lctx.output_ids[i] = -1;
            }
        }
    } else if (n_outputs == 1 && n_tokens_all > 0) {
        for (uint32_t i = 0; i < n_tokens_all; ++i) {
            lctx.output_ids[i] = -1;
        }
        lctx.output_ids[n_tokens_all - 1] = 0;
    } else {
        for (uint32_t i = 0; i < std::max<uint32_t>(n_outputs, n_outputs_embd); ++i) {
            lctx.output_ids[i] = i;
        }
    }

    bool warned_qnext_mixed_repeat = false;
    for (uint32_t cur_token = 0; cur_token < n_tokens_all; ) {
#if IK_PRINT_TIMING
        auto tim1 = ggml_time_us();
#endif
        uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
        if (llm_arch_is_hybrid(model.arch) &&
                n_tokens > 1 &&
                batch_all.n_seq_id != nullptr &&
                batch_all.seq_id != nullptr) {
            bool can_check = true;
            bool any_diff = false;
            bool has_dup = false;
            llama_seq_id first_seq_id = 0;
            std::unordered_set<llama_seq_id> seen_seq_ids;
            seen_seq_ids.reserve(n_tokens);

            for (uint32_t i = 0; i < n_tokens; ++i) {
                const uint32_t idx = cur_token + i;
                if (batch_all.n_seq_id[idx] <= 0 || batch_all.seq_id[idx] == nullptr) {
                    can_check = false;
                    break;
                }

                const llama_seq_id seq_id_i = batch_all.seq_id[idx][0];
                if (i == 0) {
                    first_seq_id = seq_id_i;
                } else if (seq_id_i != first_seq_id) {
                    any_diff = true;
                }

                if (!seen_seq_ids.insert(seq_id_i).second) {
                    has_dup = true;
                }
            }

            if (can_check && any_diff && has_dup) {
                n_tokens = 1;
                if (!warned_qnext_mixed_repeat) {
                    LLAMA_LOG_WARN("%s: qwen3next mixed-sequence batch contains repeated seq_id values; falling back to single-token chunking\n", __func__);
                    warned_qnext_mixed_repeat = true;
                }
            }
        }


        // Repack the rope buffer for the ubatch depending on whether the model uses mrope.
        // * standard rope:  (flat array)                         [t; n]
        // * mrope (token):  (section-major array of rope fields) [t; n][t; n][t; n][0;     n]
        // * mrope (embd):   (section-major array of rope fields) [t; n][h; n][w; n][extra; n]
        const uint8_t rope_params_per_token = (hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ||
            hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE) ? 4 : 1;
            
        llama_pos * u_batch_pos;
        if (!batch_all.pos) {
            u_batch_pos = nullptr;
        } else if (rope_params_per_token == 1) {
            u_batch_pos = batch_all.pos + cur_token;
        } else {
            pos.resize((size_t) n_tokens * rope_params_per_token);
            u_batch_pos = pos.data();
            for (uint32_t i = 0; i < n_tokens; ++i) {
                if (batch_all.token) {
                    pos[               i] = batch_all.pos[cur_token + i]; // t
                    pos[    n_tokens + i] = batch_all.pos[cur_token + i]; // t
                    pos[2 * n_tokens + i] = batch_all.pos[cur_token + i]; // t
                    pos[3 * n_tokens + i] = 0;
                } else { //embd
                    pos[               i] = batch_all.pos[                   cur_token + i]; // t
                    pos[    n_tokens + i] = batch_all.pos[    n_tokens_all + cur_token + i]; // h
                    pos[2 * n_tokens + i] = batch_all.pos[2 * n_tokens_all + cur_token + i]; // w
                    pos[3 * n_tokens + i] = batch_all.pos[3 * n_tokens_all + cur_token + i]; // extra (this field is supplied but unused)
                }
            }
        }

        llama_batch u_batch = {
            /* .n_tokens   = */ (int32_t) n_tokens,
            /* .token      = */ batch_all.token     ? batch_all.token    + cur_token        : nullptr,
            /* .embd       = */ batch_all.embd      ? batch_all.embd     + cur_token*n_embd : nullptr,
            /* .pos        = */ u_batch_pos,
            /* .n_seq_id   = */ batch_all.n_seq_id  ? batch_all.n_seq_id + cur_token        : nullptr,
            /* .seq_id     = */ batch_all.seq_id    ? batch_all.seq_id   + cur_token        : nullptr,
            /* .logits     = */ batch_all.logits    ? batch_all.logits   + cur_token        : nullptr,
            /* .all_pos_0  = */ batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
            /* .all_pos_1  = */ batch_all.all_pos_1,
            /* .all_seq_id = */ batch_all.all_seq_id,
        };

        // count the outputs in this u_batch
        {
            int32_t n_outputs_new = 0;

            if (u_batch.logits && !embd_pooled) {
                for (uint32_t i = 0; i < n_tokens; i++) {
                    n_outputs_new += u_batch.logits[i] != 0;
                }
            } else if (n_outputs == n_tokens_all) {
                n_outputs_new = n_tokens;
            } else {
                // keep last output only
                if (cur_token + n_tokens >= n_tokens_all) {
                    n_outputs_new = 1;
                }
            }

            // needs to happen before the graph is built
            lctx.n_outputs = n_outputs_new;
        }

        int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
        GGML_ASSERT(n_threads > 0);

        // helpers for smoother batch API transition
        // after deprecating the llama_eval calls, these will be removed
        if (u_batch.pos == nullptr) {
            pos.resize(n_tokens);
            for (uint32_t i = 0; i < n_tokens; i++) {
                pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1;
            }

            u_batch.pos = pos.data();
        }

        if (u_batch.seq_id == nullptr) {
            n_seq_id.resize(n_tokens);
            seq_id.resize(n_tokens);
            seq_id_arr.resize(n_tokens);
            for (uint32_t i = 0; i < n_tokens; i++) {
                n_seq_id[i] = 1;
                seq_id[i].resize(1);
                seq_id[i][0] = u_batch.all_seq_id;
                seq_id_arr[i] = seq_id[i].data();
            }

            u_batch.n_seq_id = n_seq_id.data();
            u_batch.seq_id = seq_id_arr.data();
        }

        // non-causal masks do not use the KV cache
        if (hparams.causal_attn) {
            int32_t ret = llama_kv_cache_update(&lctx);
            if (ret != 0) {
                return ret;
            }

            // if we have enough unused cells before the current head ->
            //   better to start searching from the beginning of the cache, hoping to fill it
            if (kv_self.head > kv_self.used + 2*n_tokens) {
                kv_self.head = 0;
            }

            if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
                return 1;
            }

            if (!kv_self.recurrent) {
                // a heuristic, to avoid attending the full cache if it is not yet utilized
                // after enough generations, the benefit from this heuristic disappears
                // if we start defragmenting the cache, the benefit from this will be more important
                const uint32_t pad = llama_kv_cache_get_padding(cparams);
                auto max_cell = llama_kv_cache_cell_max(kv_self, pad);
                kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(max_cell, pad)));
            }
        }

#if IK_PRINT_TIMING
        auto tim2 = ggml_time_us();
        printf("prelude(...): %d us\n", int(tim2-tim1));
#endif

#if IK_PRINT_TIMING
        tim1 = ggml_time_us();
#endif
        auto & prev = cparams.mtp_op_type == MTP_OP_NONE ? lctx.prev : lctx.prev_mtp;
        ggml_cgraph * gf = nullptr;
        if (!lctx.can_reuse_graph(u_batch)) {
            lctx.reset_scheduler();
            ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
#if IK_PRINT_TIMING
            tim2 = ggml_time_us();
            printf("sched_reset(...): %d us\n", int(tim2-tim1));
#endif

#if IK_PRINT_TIMING
            tim1 = ggml_time_us();
#endif
            gf = llm_build_context::llama_build_graph(lctx, u_batch, false);
#if IK_PRINT_TIMING
            tim2 = ggml_time_us();
            printf("build_graph(...): %d us\n", int(tim2-tim1));
#endif

#if IK_PRINT_TIMING
            tim1 = ggml_time_us();
#endif
            ggml_backend_sched_alloc_graph(lctx.sched, gf);
#if IK_PRINT_TIMING
            tim2 = ggml_time_us();
            printf("sched_alloc_graph(...): %d us\n", int(tim2-tim1));
#endif
            //if (u_batch.n_tokens == 1 && u_batch.embd == nullptr && lctx.cparams.graph_reuse) {
            if (u_batch.embd == nullptr && lctx.cparams.graph_reuse &&
                    !((lctx.model.arch == LLM_ARCH_GEMMA4_MTP || lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && lctx.mtp_target_ctx != nullptr)) {
                prev = std::make_unique<llama_context::Prev>(llama_context::Prev{
                        (int)u_batch.all_seq_id, (int)lctx.n_outputs, (int)lctx.kv_self.n,
                        (int)u_batch.n_tokens,
                        lctx.kv_self.save_per_step_ssm, lctx.kv_self.ckpt.per_step_max_allocated,
                        cparams.mtp_op_type, gf});
            }
        } else {
            //printf("Reusing graph with n_kv = %d, n_tokens = %d\n", (int)prev->n_kv, (int)prev->n_tokens);
            gf = prev->graph;
        }

        if (cparams.mtp_op_type != MTP_OP_NONE) {
            if (!prepare_mtp_graph_inputs(lctx, cur_token, n_tokens, n_tokens_all)) {
                return GGML_STATUS_FAILED;
            }
        }

        // the output is always the last tensor in the graph
        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
        struct ggml_tensor * embd = nullptr;

        if (lctx.n_outputs == 0) {
            // no output
            res = nullptr;
        }
        else {
            const bool has_mtp = llama_context_has_mtp_outputs(lctx);
            const bool use_raw_mtp_embd = has_mtp && (lctx.model.arch == LLM_ARCH_GEMMA4    ||
                                                      lctx.model.arch == LLM_ARCH_GEMMA4_MTP||
                                                      lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT);
            if (cparams.embeddings || has_mtp) {
                for (int i = gf->n_nodes - 1; i >= 0; --i) {
                    if (use_raw_mtp_embd && strcmp(gf->nodes[i]->name, "result_mtp_embd") == 0) {
                        // MTP recurrent state can be wider/different than the logits head hidden state.
                        embd = gf->nodes[i];
                        break;
                    }
                    if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
                        embd = gf->nodes[i];
                        break;
                    }
                    // Strictly speaking we should use if (!use_raw_mtp_embd && strcmp(gf->nodes[i]->name, "result_norm") == 0)
                    // as Gemma4 MTP is supposed to be using embeddings before rms_norm.
                    // I don't see any significant difference between this and what we had before, so not making the change (yet).
                    if (strcmp(gf->nodes[i]->name, "result_norm") == 0) {
                        embd = gf->nodes[i];
                        break;
                    }
                }
            }
            if (cparams.embeddings && lctx.model.hparams.nextn_predict_layers == 0 && !has_mtp) {
                res = nullptr; // do not extract logits for embedding case
            } else {
                if (!embd) { // do not extract embeddings when not needed
                    GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
                }
            }
        }
        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
#if IK_PRINT_TIMING == 1
        tim1 = ggml_time_us();
#endif
        llama_set_inputs(lctx, u_batch);
#if IK_PRINT_TIMING == 1
        tim2 = ggml_time_us();
        printf("set_inputs(...): %d us\n", int(tim2-tim1));
#endif

#if IK_PRINT_TIMING
        tim1 = ggml_time_us();
#endif
        llama_graph_compute(lctx, gf, n_threads);
#if IK_PRINT_TIMING
        llama_synchronize(&lctx);
        tim2 = ggml_time_us();
        printf("graph_compute(...): %d us\n", int(tim2-tim1));
#endif

        bool reset_previous = false;
        // update the kv ring buffer
        {
            if (llama_model_has_recurrent(&lctx.model) && kv_self.head == 0) {
                reset_previous = true;
            }
            kv_self.head += n_tokens;

            // Ensure kv cache head points to a valid index.
            if (kv_self.head >= kv_self.size) {
                kv_self.head = 0;
            }
        }

        // plot the computation graph in dot format (for debugging purposes)
        //if (n_past%100 == 0) {
        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
        //}

        // extract logits
        if (res) {
#if IK_PRINT_TIMING
            tim1 = ggml_time_us();
#endif
            // Do not process logits if MTP is only updating the KV cache.
            if (cparams.mtp_op_type != MTP_OP_WARMUP) { // && cparams.mtp_op_type != MTP_OP_UPDATE_ACCEPTED) {
                ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
                GGML_ASSERT(backend_res != nullptr);
                GGML_ASSERT(lctx.logits != nullptr);

                float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
                const int32_t n_outputs_new = lctx.n_outputs;

                if (n_outputs_new) {
                    GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
                    GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);

                    if (res->ne[1] == n_tokens && n_outputs_new < n_tokens) {
                        int32_t i_out = 0;
                        if (u_batch.logits && !embd_pooled) {
                            for (uint32_t i = 0; i < n_tokens; i++) {
                                if (u_batch.logits[i]) {
                                    ggml_backend_tensor_get_async(backend_res, res, logits_out + i_out*n_vocab, i*n_vocab*sizeof(float), n_vocab*sizeof(float));
                                    i_out++;
                                }
                            }
                        } else if (cur_token + n_tokens >= n_tokens_all) {
                            ggml_backend_tensor_get_async(backend_res, res, logits_out, (n_tokens - 1)*n_vocab*sizeof(float), n_vocab*sizeof(float));
                        }
                    } else {
                        ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
                    }
                }
            }
#if IK_PRINT_TIMING
            tim2 = ggml_time_us();
            printf("get_result(...): %d us\n", int(tim2-tim1));
#endif
        }

        // extract embeddings
        //if (embd && (cparams.mtp_op_type == MTP_OP_NONE || cparams.mtp_op_type == MTP_OP_DRAFT_GEN)) {
        if (embd && cparams.mtp_op_type != MTP_OP_WARMUP) {
#if IK_PRINT_TIMING
            tim1 = ggml_time_us();
#endif
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
            GGML_ASSERT(backend_embd != nullptr);

            switch (cparams.pooling_type) {
                case LLAMA_POOLING_TYPE_NONE:
                    {
                        // extract token embeddings
                        GGML_ASSERT(lctx.embd != nullptr);
                        float * embd_out = lctx.embd + n_outputs_prev_embd*n_embd_output;
                        const int32_t n_outputs_new_embd = has_mtp ? embd->ne[1] : lctx.n_outputs;

                        if (n_outputs_new_embd) {
                            GGML_ASSERT( n_outputs_prev_embd + n_outputs_new_embd <= n_outputs_embd);
                            GGML_ASSERT((n_outputs_prev_embd + n_outputs_new_embd)*n_embd_output <= (int64_t) lctx.embd_size);
                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new_embd*n_embd_output*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_MEAN:
                case LLAMA_POOLING_TYPE_CLS:
                case LLAMA_POOLING_TYPE_LAST:
                    {
                        // extract sequence embeddings
                        auto & embd_seq_out = lctx.embd_seq;
                        embd_seq_out.clear();

                        for (uint32_t i = 0; i < n_tokens; i++) {
                            const llama_seq_id seq_id = u_batch.seq_id[i][0];
                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                                continue;
                            }
                            embd_seq_out[seq_id].resize(n_embd);
                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_UNSPECIFIED:
                    {
                        GGML_ABORT("unknown pooling type");
                    }
            }
#if IK_PRINT_TIMING
            tim2 = ggml_time_us();
            printf("get_embedding(...): %d us\n", int(tim2-tim1));
#endif
        }
        n_outputs_prev += lctx.n_outputs;
        n_outputs_prev_embd += (has_mtp && embd) ? embd->ne[1] : lctx.n_outputs;
        cur_token += n_tokens;
        if (reset_previous) {
            // We need to discard this graph. Otherwise, iwith CUDA graphs enabled, the graph will get resused and this will reset the
            // recurrent state for each new token. This is probably not very relevant in practice because we basically never run TG with
            // empty context, but for the sake of correctness let's just do it.
            lctx.prev.reset();
        }
        if (stop_internal_decode) {
            return -3;
        }
    }

    // set to total number of outputs in the batch, for use in llama_get_logits_ith
    lctx.n_outputs = n_outputs;
    lctx.n_outputs_embd = n_outputs_embd;

    // wait for the computation to finish (automatically done when obtaining the model output)
    //llama_synchronize(&lctx);

    // decide if we need to defrag the kv cache
    if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;

        // queue defragmentation for next llama_kv_cache_update
        if (fragmentation > cparams.defrag_thold) {
            LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);

            llama_kv_cache_defrag(kv_self);
        }
    }

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
#if IK_PRINT_TIMING
    auto tim1 = ggml_time_us();
#endif
    if (!lctx.prev) {
        lctx.reset_scheduler();
    }
#if IK_PRINT_TIMING
        auto tim2 = ggml_time_us();
        printf("sched_reset(...): %d us\n", int(tim2-tim1));
#endif

    return 0;
}

// encode a batch of tokens by evaluating the encoder part of the transformer
//
//   - lctx:      llama context
//   - batch:     batch to evaluate
//
// return 0 on success
// return positive int on warning
// return negative int on error
//
static int llama_encode_internal(
         llama_context & lctx,
           llama_batch   batch) {

    lctx.is_encoding = true;

    const uint32_t n_tokens = batch.n_tokens;

    if (n_tokens == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
        return -1;
    }

    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");

    if (lctx.t_compute_start_us == 0) {
        lctx.t_compute_start_us = ggml_time_us();
    }

    lctx.n_queued_tokens += n_tokens;

    const int64_t n_embd = hparams.n_embd;

    // TODO: simplify or deprecate
    std::vector<llama_pos> pos;
    std::vector<int32_t>                   n_seq_id;
    std::vector<llama_seq_id *>            seq_id_arr;
    std::vector<std::vector<llama_seq_id>> seq_id;

    // reserve output buffer
    if (llama_output_reserve(lctx, n_tokens) < n_tokens) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
        return -2;
    };

    for (uint32_t i = 0; i < n_tokens; ++i) {
        lctx.output_ids[i] = i;
    }

    lctx.inp_embd_enc = NULL;
    lctx.n_outputs = n_tokens;

    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
    GGML_ASSERT(n_threads > 0);

    // helpers for smoother batch API transition
    // after deprecating the llama_eval calls, these will be removed
    if (batch.pos == nullptr) {
        pos.resize(n_tokens);
        for (uint32_t i = 0; i < n_tokens; i++) {
            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
        }

        batch.pos = pos.data();
    }

    if (batch.seq_id == nullptr) {
        n_seq_id.resize(n_tokens);
        seq_id.resize(n_tokens);
        seq_id_arr.resize(n_tokens);
        for (uint32_t i = 0; i < n_tokens; i++) {
            n_seq_id[i] = 1;
            seq_id[i].resize(1);
            seq_id[i][0] = batch.all_seq_id;
            seq_id_arr[i] = seq_id[i].data();
        }

        batch.n_seq_id = n_seq_id.data();
        batch.seq_id = seq_id_arr.data();
    }

    lctx.reset_scheduler();
    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);

    ggml_cgraph * gf = llm_build_context::llama_build_graph(lctx, batch, false);

    // the output embeddings after the final encoder normalization
    struct ggml_tensor * embd = nullptr;

    // there are two cases here
    if (llama_model_has_decoder(&lctx.model)) {
        // first case is an encoder-decoder T5 model where embeddings are passed to decoder
        embd = gf->nodes[gf->n_nodes - 1];
        GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
    } else {
        // second case is an encoder-only T5 model
        if (cparams.embeddings) {
            // only output embeddings if required
            embd = gf->nodes[gf->n_nodes - 1];
            if (strcmp(embd->name, "result_embd_pooled") != 0) {
                embd = gf->nodes[gf->n_nodes - 2];
            }
            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
        }
    }

    ggml_backend_sched_alloc_graph(lctx.sched, gf);

    llama_set_inputs(lctx, batch);

    llama_graph_compute(lctx, gf, n_threads);

    // extract embeddings
    if (embd) {
        if (cparams.mtp_op_type == MTP_OP_NONE) {
            ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
            GGML_ASSERT(backend_embd != nullptr);

            if (llama_model_has_decoder(&lctx.model)) {
                lctx.embd_enc.resize(n_tokens*n_embd);
                float * embd_out = lctx.embd_enc.data();

                ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));

                // remember the sequence ids used during the encoding - needed for cross attention later
                lctx.seq_ids_enc.resize(n_tokens);
                for (uint32_t i = 0; i < n_tokens; i++) {
                    for (int s = 0; s < batch.n_seq_id[i]; s++) {
                        llama_seq_id seq_id = batch.seq_id[i][s];
                        lctx.seq_ids_enc[i].insert(seq_id);
                    }
                }
            } else {
                GGML_ASSERT(lctx.embd != nullptr);

                switch (cparams.pooling_type) {
                    case LLAMA_POOLING_TYPE_NONE:
                        {
                            // extract token embeddings
                            GGML_ASSERT(lctx.embd != nullptr);
                            float * embd_out = lctx.embd;

                            GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
                            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
                        } break;
                    case LLAMA_POOLING_TYPE_MEAN:
                    case LLAMA_POOLING_TYPE_CLS:
                    case LLAMA_POOLING_TYPE_LAST:
                        {
                            // extract sequence embeddings
                            auto & embd_seq_out = lctx.embd_seq;
                            embd_seq_out.clear();

                            for (uint32_t i = 0; i < n_tokens; i++) {
                                const llama_seq_id seq_id = batch.seq_id[i][0];
                                if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                                    continue;
                                }
                                embd_seq_out[seq_id].resize(n_embd);
                                ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                            }
                        } break;
                    case LLAMA_POOLING_TYPE_UNSPECIFIED:
                        {
                            GGML_ABORT("unknown pooling type");
                        }
                }
            }
        }
    }

    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
    // overlap with device computation.
    lctx.reset_scheduler();

    return 0;
}

// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
    auto & kv_self = lctx.kv_self;

    const auto & hparams = lctx.model.hparams;

    const uint32_t n_layer = hparams.n_layer;

    const uint32_t n_kv   = llama_kv_cache_cell_max(kv_self);
    const uint32_t n_used = kv_self.used;

    assert(n_used <= n_kv);

    //const int64_t t_start = ggml_time_us();

    // number of cells moved
    uint32_t n_moves = 0;

    // each move requires 6*n_layer tensors (see build_defrag)
    //   - source view, destination view, copy operation
    //   - x2 for keys and values
    //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
    const uint32_t max_moves = (lctx.model.max_nodes(1) - 2*n_layer)/(6*n_layer);

    // determine which KV cells to move where
    //
    //  cell i moves to ids[i]
    //
    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
    //
    std::vector<uint32_t> ids(n_kv, n_kv);

    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
        const auto & cell0 = kv_self.cells[i0];

        if (!cell0.is_empty()) {
            ids[i0] = i0;

            continue;
        }

        // found a hole - fill it with data from the end of the cache

        uint32_t nh = 1;

        // determine the size of the hole
        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
            nh++;
        }

        uint32_t nf = 0;
        uint32_t is = n_kv - 1;

        // starting from the end, find nh non-empty cells
        for (; is > i0; --is) {
            const auto & cell1 = kv_self.cells[is];

            if (cell1.is_empty() || ids[is] != n_kv) {
                continue;
            }

            // non-empty cell which is not yet moved
            nf++;

            if (nf == nh) {
                break;
            }
        }

        // this can only happen if `n_used` is not accurate, which would be a bug
        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");

        nf = 0;

        uint32_t i1 = is;

        // are we moving a continuous block of memory?
        bool cont = false;

        // should we stop searching for the next move?
        bool stop = false;

        // go back and move the nf cells to the hole
        for (; i1 < n_kv; ++i1) {
            auto & cell1 = kv_self.cells[i1];

            if (cell1.is_empty() || ids[i1] != n_kv) {
                if (n_moves == max_moves) {
                    stop = true;
                    break;
                }

                cont = false;
                continue;
            }

            // this cell goes to (i0 + nf)
            ids[i1] = i0 + nf;

            // move the cell meta data
            kv_self.cells[i0 + nf] = cell1;

            // clear the old cell and move the head there
            cell1 = llama_kv_cell();
            kv_self.head = n_used;

            if (!cont) {
                n_moves++;
                cont = true;
            }

            nf++;

            if (nf == nh) {
                break;
            }
        }

        if (stop || n_moves == max_moves) {
            break;
        }

        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);

        i0 += nh - 1;
    }

    if (n_moves == 0) {
        return;
    }

    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);

    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);

#if 0
    // CPU defrag
    //
    // TODO: optimizations are possible:
    //       - multiple threads
    //       - avoid copying to the host memory when already there
    //
    // likely not worth the effort, as we have ggml_graph based defrag
    //

    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();

    const uint32_t kv_size = kv_self.size;

    std::vector<uint8_t> buf_k;
    std::vector<uint8_t> buf_v;

    for (uint32_t il = 0; il < n_layer; ++il) {
        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
        const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);

        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
        const size_t v_size    = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);

        buf_k.resize(k_size);
        buf_v.resize(v_size);

        ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
        ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());

        // batch move [i, i+nm) to [id, id+nm)
        // note: cells can move only to a lower index
        for (uint32_t i = 0; i < n_kv; ++i) {
            const uint32_t id = ids[i];

            if (i == id || id == n_kv) {
                continue;
            }

            uint32_t nm = 1;

            while (i + nm < n_kv && ids[i + nm] == id + nm) {
                nm++;
            }

            // move keys
            {
                const int64_t os =  i*k_size_row;
                const int64_t od = id*k_size_row;

                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
            }

            // move values (note: they are transposed)
            {
                const int64_t os =  i;
                const int64_t od = id;

                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
                }
            }

            i += nm - 1;
        }

        ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
        ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
    }
#else
    // ggml_graph defrag

    lctx.reset_scheduler();

    ggml_cgraph * gf = llm_build_context::llama_build_graph_defrag(lctx, ids);

    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
#endif

    //const int64_t t_end = ggml_time_us();

    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
}

static bool get_can_shift(struct llama_context & lctx) {
    bool no_shift = lctx.model.is_mla_model();
    no_shift = no_shift || lctx.model.hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE;
    return !no_shift;
}

static int32_t llama_kv_cache_update_internal(struct llama_context & lctx) {
    bool need_reserve = false;

    // apply K-shift if needed
    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
        if (!get_can_shift(lctx)) {
            return 1;
        }

        {
            lctx.reset_scheduler();

            ggml_cgraph * gf = llm_build_context::llama_build_graph_k_shift(lctx);

            ggml_backend_sched_alloc_graph(lctx.sched, gf);

            llama_set_k_shift(lctx);

            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);

            need_reserve = true;
        }

        {
            auto & kv_self = lctx.kv_self;

            kv_self.has_shift = false;

            for (uint32_t i = 0; i < kv_self.size; ++i) {
                kv_self.cells[i].delta = 0;
            }
        }
    }

    if ((lctx.kv_self.recurrent || llama_kv_has_qnext_state_storage(lctx.kv_self)) && lctx.kv_self.do_copy) {
        {
            lctx.reset_scheduler();

            ggml_cgraph * gf = llm_build_context::llama_build_graph_s_copy(lctx);

            ggml_backend_sched_alloc_graph(lctx.sched, gf);

            llama_set_s_copy(lctx);

            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);

            need_reserve = true;
        }

        {
            auto & kv_self = lctx.kv_self;

            kv_self.do_copy = false;

            for (uint32_t i = 0; i < kv_self.size; ++i) {
                kv_self.cells[i].src = i;
            }
        }
    }

    // defragment the KV cache if needed
    if (lctx.kv_self.do_defrag) {
        llama_kv_cache_defrag_internal(lctx);

        need_reserve = true;

        lctx.kv_self.do_defrag = false;
    }

    // reserve a worst case graph again
    if (need_reserve) {
        // TODO: extract to a function
        // build worst-case graph
        int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
        int n_past = lctx.cparams.n_ctx - n_tokens;
        llama_token token = llama_token_bos(&lctx.model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
        ggml_cgraph * gf = llm_build_context::llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true, lctx.cparams.worst_graph_tokens);

        // initialize scheduler with the worst-case graph
        lctx.reset_scheduler();
        if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
            LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
        }
    }
    return 0;
}

static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
    if (!ctx_gguf) {
        throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
    }

    // check metadata
    {
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf, key.c_str());
            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf, key.c_str());
            return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
        };
        LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

        auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
        if (general_type != "adapter") {
            gguf_free(ctx_gguf);
            throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
        }

        auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
        auto general_arch = llm_arch_from_string(general_arch_str);
        if (general_arch != model->arch) {
            gguf_free(ctx_gguf);
            throw std::runtime_error("model arch and LoRA arch mismatch");
        }

        auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
        if (adapter_type != "lora") {
            gguf_free(ctx_gguf);
            throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
        }

        adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
    }

    int n_tensors = gguf_get_n_tensors(ctx_gguf);

    // contexts for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
            struct ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
            };
            ggml_context * buft_ctx = ggml_init(params);
            ctx_map[buft] = buft_ctx;
            return buft_ctx;
        };
        return it->second;
    };

    // bundle lora_a and lora_b into pairs
    std::map<std::string, llama_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };
    for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name(cur->name);
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
                ab_map[name] = llama_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
        } else {
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
        }
    }

    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
        llama_lora_weight & w = it.second;

        if (!w.a || !w.b) {
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
        }

        // device buft and device ctx
        auto * model_tensor = llama_get_model_tensor(model, name.c_str());
        if (!model_tensor) {
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
        }
        struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
        if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            throw std::runtime_error("tensor '" + name + "' has incorrect shape");
        }
        if (w.a->ne[1] != w.b->ne[0]) {
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
        }
        // save tensor to adapter
        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
    {
        adapter.ctxs.reserve(ctx_map.size());
        adapter.bufs.reserve(ctx_map.size());
        for (auto it : ctx_map) {
            ggml_backend_buffer_type_t buft = it.first;
            ggml_context * ctx_dev = it.second;
            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
            if (!buf) {
                gguf_free(ctx_gguf);
                ggml_free(ctx);
                throw std::runtime_error("failed to allocate buffer for lora adapter\n");
            }
            LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
            adapter.ctxs.push_back(ctx_dev);
            adapter.bufs.push_back(buf);
        }
    }

    // set tensor data
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
            gguf_file.seek(offs, SEEK_SET);
            gguf_file.read_raw(read_buf.data(), size);
            ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
        };
        for (auto & it : adapter.ab_map) {
            auto orig = ab_map[it.first];
            auto dev  = it.second;
            set_tensor(orig.a, dev.a);
            set_tensor(orig.b, dev.b);
        }
    }

    LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);

    // free ctx for reading gguf
    gguf_free(ctx_gguf);
    ggml_free(ctx);
}

int32_t llama_lora_adapter_set(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter,
            float scale) {
    if (ctx->cparams.flash_attn) {
        LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
        return -1;
    }
    ctx->lora_adapters[adapter] = scale;
    return 0;
}

int32_t llama_lora_adapter_remove(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter) {
    auto pos = ctx->lora_adapters.find(adapter);
    if (pos != ctx->lora_adapters.end()) {
        ctx->lora_adapters.erase(pos);
        return 0;
    }
    return -1;
}

void llama_lora_adapter_clear(struct llama_context * ctx) {
    ctx->lora_adapters.clear();
}

void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
    delete adapter;
}

//
// interface implementation
//
struct llama_model_params llama_model_default_params() {
    struct llama_model_params result = {
        /*.devices                 =*/ nullptr,
        /*.n_gpu_layers                =*/ 0,
        /*.mla                         =*/ 0,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.max_gpu                     =*/ 0,
        /*.ncmoe                       =*/ 0,
        /*.type_k                      =*/ GGML_TYPE_F16,
        /*.type_v                      =*/ GGML_TYPE_F16,
        /*.max_ctx_size                =*/ 0,
        /*.n_seq_max                   =*/ 1,
        /*.n_ubatch                    =*/ 512,
        /*.amb                         =*/ 0,
        /*.fit_margin                  =*/ 0,
        /*.fit                         =*/ false,
        /*.worst_graph_tokens          =*/ 0,
        /*.type_first_k                =*/ GGML_TYPE_F16,
        /*.type_last_k                 =*/ GGML_TYPE_F16,
        /*.type_first_v                =*/ GGML_TYPE_F16,
        /*.type_last_v                 =*/ GGML_TYPE_F16,
        /*.n_first_k                   =*/ -1,
        /*.n_last_k                    =*/ -1,
        /*.n_first_v                   =*/ -1,
        /*.n_last_v                    =*/ -1,
        /*.extra_output_type           =*/ GGML_TYPE_COUNT,
        /*.tensor_split                =*/ nullptr,
        /*.fit_margin_array            =*/ nullptr,
        /*.rpc_servers                 =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.tensor_buft_overrides       =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
        /*.repack_tensors              =*/ false,
        /*.use_thp                     =*/ false,
        /*.validate_quants             =*/ false,
        /*.merge_qkv                   =*/ false,
        /*.merge_up_gate_exps          =*/ false,
        /*.mtp                         =*/ false,
        /*.dry_run                     =*/ false,
        /*.flash_attn                  =*/ true,
        /*.defer_experts               =*/ false,
    };

#ifdef GGML_USE_METAL
    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
    result.n_gpu_layers = 999;
#endif

    return result;
}

struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.seed                        =*/ LLAMA_DEFAULT_SEED,
        /*.n_ctx                       =*/ 512,
        /*.n_batch                     =*/ 2048,
        /*.n_ubatch                    =*/ 512,
        /*.n_seq_max                   =*/ 1,
        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
        /*.max_extra_alloc             =*/ 256,
        /*.worst_case_tokens           =*/ 0,
        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
        /*.pooling_type                =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
        /*.attention_type              =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
        /*.rope_freq_base              =*/ 0.0f,
        /*.rope_freq_scale             =*/ 0.0f,
        /*.yarn_ext_factor             =*/ -1.0f,
        /*.yarn_attn_factor            =*/ -1.0f,
        /*.yarn_beta_fast              =*/ -1.0f,
        /*.yarn_beta_slow              =*/ -1.0f,
        /*.yarn_orig_ctx               =*/ 0,
        /*.defrag_thold                =*/ -1.0f,
        /*.cb_eval                     =*/ nullptr,
        /*.cb_eval_user_data           =*/ nullptr,
        /*.type_k                      =*/ GGML_TYPE_F16,
        /*.type_v                      =*/ GGML_TYPE_F16,
        /*.type_reduce                 =*/ GGML_TYPE_F16,
        /*.type_graph_attn             =*/ GGML_TYPE_F16,
        /*.type_first_k                =*/ GGML_TYPE_F16,
        /*.type_last_k                 =*/ GGML_TYPE_F16,
        /*.type_first_v                =*/ GGML_TYPE_F16,
        /*.type_last_v                 =*/ GGML_TYPE_F16,
        /*.n_first_k                   =*/ -1,
        /*.n_last_k                    =*/ -1,
        /*.n_first_v                   =*/ -1,
        /*.n_last_v                    =*/ -1,
        /*.logits_all                  =*/ false,
        /*.embeddings                  =*/ false,
        /*.offload_kqv                 =*/ true,
        /*.flash_attn                  =*/ true,
        /*.mla_attn                    =*/ 3,
        /*.attn_max_batch              =*/ 0,
        /*.fused_moe_up_gate           =*/ true,
        /*.grouped_expert_routing      =*/ false,
        /*.fused_up_gate               =*/ true,
        /*.fused_mmad                  =*/ true,
        /*.rope_cache                  =*/ false,
        /*.graph_reuse                 =*/ true,
        /*.min_experts                 =*/ -1,
        /*.thtesh_experts              =*/ 0.0f,
        /*.only_active_experts         =*/ false,
        /*.k_cache_hadamard            =*/ false,
        /*.v_cache_hadamard            =*/ false,
        /*.split_mode_graph_scheduling =*/ false,
        // /*.split_mode_f16           =*/ true,
        /*.scheduler_async             =*/ false,
        /*.mtp                         =*/ false,
        /*.mtp_op_type                 =*/ MTP_OP_NONE,
        /*.abort_callback              =*/ nullptr,
        /*.abort_callback_data         =*/ nullptr,
        /*.offload_policy              =*/ nullptr,
        /*.cuda_params                 =*/ nullptr,
    };

    return result;
}

struct llama_model_quantize_params llama_model_quantize_default_params() {
    struct llama_model_quantize_params result = {
        /*.nthread                     =*/ 0,
        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
        /*.attn_q_type                 =*/ GGML_TYPE_COUNT,
        /*.attn_k_type                 =*/ GGML_TYPE_COUNT,
        /*.attn_v_type                 =*/ GGML_TYPE_COUNT,
        /*.attn_qkv_type               =*/ GGML_TYPE_COUNT,
        /*.attn_output_type            =*/ GGML_TYPE_COUNT,
        /*.ffn_gate_type               =*/ GGML_TYPE_COUNT,
        /*.ffn_down_type               =*/ GGML_TYPE_COUNT,
        /*.ffn_up_type                 =*/ GGML_TYPE_COUNT,
        /*.ffn_gat_inp_type            =*/ GGML_TYPE_COUNT,
        /*.extra_output_type           =*/ GGML_TYPE_COUNT,
        /*.allow_requantize            =*/ false,
        /*.quantize_output_tensor      =*/ true,
        /*.only_copy                   =*/ false,
        /*.pure                        =*/ false,
        /*.keep_split                  =*/ false,
        /*.ignore_imatrix_rules        =*/ false,
        /*.only_repack                 =*/ false,
        /*.dry_run                     =*/ false,
        /*.partial_requant             =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.custom_quants               =*/ nullptr,
        /*.repack_pattern              =*/ nullptr,
        /*.user_data                   =*/ nullptr,
    };

    return result;
}

size_t llama_max_devices(void) {
#if defined(GGML_USE_RPC)
    return GGML_RPC_MAX_SERVERS;
#elif defined(GGML_USE_METAL)
    return 1;
#elif defined(GGML_USE_CUDA)
    return GGML_CUDA_MAX_DEVICES;
#elif defined(GGML_USE_SYCL)
    return GGML_SYCL_MAX_DEVICES;
#elif defined(GGML_USE_VULKAN)
    return GGML_VK_MAX_DEVICES;
#elif defined(GGML_USE_CANN)
    return GGML_CANN_MAX_DEVICES;
#else
    return 1;
#endif
}

bool llama_supports_mmap(void) {
    return llama_mmap::SUPPORTED;
}

bool llama_supports_mlock(void) {
    return llama_mlock::SUPPORTED;
}

bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)   || defined(GGML_USE_VULKAN) || \
    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
    // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
    return true;
#else
    return false;
#endif
}

void llama_backend_init(void) {
    ggml_time_init();

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
}

void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa != GGML_NUMA_STRATEGY_DISABLED) {
        ggml_numa_init(numa);
    }
}

void llama_backend_free(void) {
    ggml_quantize_free();
}

int64_t llama_time_us(void) {
    return ggml_time_us();
}

static std::string create_rpc_name(std::string endpoint, uint32_t device) {
    std::string dev_name = "RPC" + std::to_string(device) + "[" + std::string(endpoint) + "]";
    return dev_name;
}

struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params   params) {
    ggml_time_init();

    llama_model * model = new llama_model;

    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned * cur_percentage_p = (unsigned *) ctx;
            unsigned percentage = (unsigned) (100 * progress);
            while (percentage > *cur_percentage_p) {
                *cur_percentage_p = percentage;
                LLAMA_LOG_INFO(".");
                if (percentage >= 100) {
                    LLAMA_LOG_INFO("\n");
                }
            }
            return true;
        };
    }
    model->set_tensor_overrides(params);
    // model->devices hold device indices that are used to offload
    // use model->devices to determine offload device
    // if no device is specified, all device are included
    // if device is specified, only those in the devices are included in the model->devices

    std::vector<std::string> params_devices;
    if (params.devices && !striequals(params.devices, "")) {
        params_devices = string_split(params.devices, ",");
    }

    std::map<std::string, int32_t> buffer_names;
    std::vector<std::string> gpu_names;
    bool has_rpc = params.rpc_servers != nullptr && params.rpc_servers[0] != '\0';
    int32_t idx = 0;
    int dev_count = (int)llama_get_device_count(*model);
    // list all buffer type names
    for (idx = 0; idx < dev_count; idx++) {
        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, idx);
        const char* name = ggml_backend_buft_name(buft);
        buffer_names.insert({ std::string(name), idx });
        gpu_names.push_back(std::string(name));
    }
    if (has_rpc) {
        model->rpc_servers = extract_device_from_rpc_device(string_split(params.rpc_servers, ","));
        for (auto rpc : model->rpc_servers) {
            buffer_names.insert({ create_rpc_name(rpc.endpoint, rpc.device), idx});
            idx++;
        }
    }
    std::vector<std::string> device_names;
    if (params_devices.size()) {
        device_names = params_devices;
    } else {
        // add RPC servers at the front of the list to minimize the network transfers
        if (has_rpc) {
            for (auto& it : model->rpc_servers) {
                device_names.push_back(create_rpc_name(it.endpoint, it.device));
            }
        }
        device_names.insert(device_names.end(), gpu_names.begin(), gpu_names.end());
    }

    for (auto & device : device_names) {
        if (buffer_names.count(device)) {
            model->devices.push_back(buffer_names[device]);
        } else {
            LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
        }
    }

    // no gpu used, so set layers offload to be 0
    if (!model->devices.size()) {
        params.n_gpu_layers = 0;
        LLAMA_LOG_INFO("CPU: using device CPU\n");
    } else {
        for (auto i : model->devices) {
            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
            const char* name = ggml_backend_buft_name(buft);
            const char* description = name;
            size_t description_size = llama_get_device_memory(*model, i);
            LLAMA_LOG_INFO("%s: using device %s - %zu MiB free\n",
                name, description,
                description_size / 1024 / 1024);
        }
    }
    int status = llama_model_load(path_model, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }
        delete model;
        return nullptr;
    }

    return model;
}

void llama_free_model(struct llama_model * model) {
    delete model;
}

static void llama_repack_up_gate_exps(llama_context & lctx) {
    if (lctx.cparams.mtp_op_type != MTP_OP_NONE) {
        return;
    }
    auto & model = lctx.model;
    bool needs_repack = false;
    for (auto & l : model.layers) {
        if (l.ffn_up_gate_exps && l.ffn_up_exps && l.ffn_gate_exps &&
           !l.ffn_up_gate_exps->extra) {
            needs_repack = true; break;
        }
    }
    if (!needs_repack) return;

    std::vector<char> aux_buffer_up, aux_buffer_gate, aux_buffer_up_gate;
    for (int il = 0; il < int(model.layers.size()); ++il) {
        auto & l = model.layers[il];
        if (l.ffn_up_gate_exps && l.ffn_up_exps && l.ffn_gate_exps &&
           !l.ffn_up_gate_exps->extra) {
            GGML_ASSERT(l.ffn_up_exps->type == l.ffn_gate_exps->type);
            if (l.ffn_up_gate_exps->type != l.ffn_up_exps->type) {
                auto [other_type, _] = interleaved_properties(l.ffn_up_gate_exps->type);
                GGML_ASSERT(other_type == l.ffn_up_exps->type);
            }
            GGML_ASSERT(l.ffn_up_gate_exps->ne[0] == l.ffn_up_exps->ne[0] && l.ffn_up_gate_exps->ne[0] == l.ffn_gate_exps->ne[0]);
            GGML_ASSERT(l.ffn_up_gate_exps->ne[2] == l.ffn_up_exps->ne[2] && l.ffn_up_gate_exps->ne[2] == l.ffn_gate_exps->ne[2]);
            GGML_ASSERT(l.ffn_up_gate_exps->ne[1] == l.ffn_up_exps->ne[1] + l.ffn_gate_exps->ne[1]);
            auto nbytes = ggml_nbytes(l.ffn_up_exps);
            GGML_ASSERT(nbytes == ggml_nbytes(l.ffn_gate_exps));
            if (nbytes > aux_buffer_up.size()) {
                aux_buffer_up.resize(nbytes);
            }
            if (nbytes > aux_buffer_gate.size()) {
                aux_buffer_gate.resize(nbytes);
            }
            LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
            ggml_backend_tensor_get(l.ffn_up_exps, aux_buffer_up.data(), 0, nbytes);
            ggml_backend_tensor_get(l.ffn_gate_exps, aux_buffer_gate.data(), 0, nbytes);
            if (aux_buffer_up_gate.size() < 2*nbytes) {
                aux_buffer_up_gate.resize(2*nbytes);
            }
            size_t offset_up_gate = 0;
            size_t offset_up = 0;
            auto expert_size = l.ffn_up_exps->ne[1]*l.ffn_up_exps->nb[1];
            for (int i2 = 0; i2 < (int)l.ffn_up_gate_exps->ne[2]; ++i2) {
                std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_gate.data() + offset_up, expert_size);
                offset_up_gate += expert_size;
                std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_up.data() + offset_up, expert_size);
                offset_up_gate += expert_size;
                offset_up      += expert_size;
            }
            ggml_backend_tensor_set(l.ffn_up_gate_exps, aux_buffer_up_gate.data(), 0, 2*expert_size*l.ffn_up_gate_exps->ne[2]);
            if (l.ffn_up_gate_exps_b && l.ffn_up_exps_b && l.ffn_gate_exps_b) {
                nbytes = ggml_nbytes(l.ffn_up_exps_b);
                GGML_ASSERT(nbytes == ggml_nbytes(l.ffn_gate_exps_b));
                if (nbytes > aux_buffer_up.size()) {
                    aux_buffer_up.resize(nbytes);
                }
                if (nbytes > aux_buffer_gate.size()) {
                    aux_buffer_gate.resize(nbytes);
                }
                LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
                ggml_backend_tensor_get(l.ffn_up_exps_b, aux_buffer_up.data(), 0, nbytes);
                ggml_backend_tensor_get(l.ffn_gate_exps_b, aux_buffer_gate.data(), 0, nbytes);
                if (aux_buffer_up_gate.size() < 2*nbytes) {
                    aux_buffer_up_gate.resize(2*nbytes);
                }
                offset_up_gate = 0;
                offset_up = 0;
                expert_size = l.ffn_up_exps_b->nb[1];
                for (int i1 = 0; i1 < (int)l.ffn_up_gate_exps_b->ne[1]; ++i1) {
                    std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_gate.data() + offset_up, expert_size);
                    offset_up_gate += expert_size;
                    std::memcpy(aux_buffer_up_gate.data() + offset_up_gate, aux_buffer_up.data() + offset_up, expert_size);
                    offset_up_gate += expert_size;
                    offset_up      += expert_size;
                }
                ggml_backend_tensor_set(l.ffn_up_gate_exps_b, aux_buffer_up_gate.data(), 0, 2*expert_size*l.ffn_up_gate_exps_b->ne[1]);
            }
        }
    }
}

struct llama_context * llama_init_from_model(
                 struct llama_model * model,
        struct llama_context_params   params) {

    if (!model) {
        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
        return nullptr;
    }

    if (params.n_batch == 0 && params.n_ubatch == 0) {
        LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
        return nullptr;
    }

    if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
        LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
        return nullptr;
    }

    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
        params.flash_attn = false;
    }

    //if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
    //    LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
    //    params.flash_attn = false;
    //}

    if (params.type_v != GGML_TYPE_F16 && params.type_v != GGML_TYPE_BF16 && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
    }

    if (params.k_cache_hadamard && !ggml_is_quantized(params.type_k)) {
        LLAMA_LOG_WARN("%s: there is no point in Hadamard transforms with not quantized K-cache. Turning K-cache Hadamard off\n", __func__);
        params.k_cache_hadamard = false;
    }
    if (params.k_cache_hadamard) {
        int nok = 0;
        for (int il = 0; il < model->hparams.n_layer; ++il) {
            if (model->hadamard_size_k(il) > 0) ++nok;
        }
        if (nok == 0) {
            LLAMA_LOG_WARN("%s: no layer allows a K-head Hadamard transform. Turning K-cache Hadamard off\n", __func__);
            params.k_cache_hadamard = false;
        }
    }

    if (params.v_cache_hadamard && !ggml_is_quantized(params.type_v)) {
        LLAMA_LOG_WARN("%s: there is no point in Hadamard transforms with not quantized V-cache. Turning V-cache Hadamard off\n", __func__);
        params.v_cache_hadamard = false;
    }
    if (params.v_cache_hadamard) {
        int nok = 0;
        for (int il = 0; il < model->hparams.n_layer; ++il) {
            if (model->hadamard_size_v(il) > 0) ++nok;
        }
        if (nok == 0) {
            LLAMA_LOG_WARN("%s: no layer allows a V-head Hadamard transform. Turning V-cache Hadamard off\n", __func__);
            params.v_cache_hadamard = false;
        }
    }

    llama_context * ctx = new llama_context(*model);

    // add devices to ctx->cparams from model
    for (int i : model->devices) {
        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
        const char* name = ggml_backend_buft_name(buft);
        std::string device(name);
        ctx->cparams.devices.push_back(device);
    }

    const auto & hparams = model->hparams;
    auto       & cparams = ctx->cparams;


    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
    cparams.n_threads        = params.n_threads;
    cparams.n_threads_batch  = params.n_threads_batch;
    cparams.yarn_ext_factor  = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
    cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
    cparams.yarn_beta_fast   = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
    cparams.yarn_beta_slow   = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
    cparams.defrag_thold     = params.defrag_thold;
    cparams.embeddings       = params.embeddings;
    cparams.offload_kqv      = params.offload_kqv;
    cparams.flash_attn       = params.flash_attn;
    cparams.mla_attn         = params.mla_attn;
    cparams.attn_max_batch   = params.attn_max_batch;
    cparams.fused_moe_up_gate= params.fused_moe_up_gate;
    cparams.grouped_expert_routing = params.grouped_expert_routing;
    cparams.fused_up_gate    = params.fused_up_gate;
    cparams.fused_mmad       = params.fused_mmad;
    cparams.rope_cache       = params.rope_cache;
    cparams.graph_reuse      = params.graph_reuse;
    cparams.k_cache_hadamard = params.k_cache_hadamard;
    cparams.v_cache_hadamard = params.v_cache_hadamard;
    // Folding H into wv_b/wk_b_pp permanently mutates the model; a later context
    // on the same model with khad=false would consume an H-applied weight against
    // an un-applied cache and produce wrong values. Force khad=true to keep math
    // consistent for the rest of the model's lifetime.
    if (model->khad_pretransformed && !cparams.k_cache_hadamard) {
        LLAMA_LOG_WARN("%s: model has Hadamard-folded wv_b/wk_b_pp; forcing k_cache_hadamard=true\n", __func__);
        cparams.k_cache_hadamard = true;
    }
    if (cparams.k_cache_hadamard) {
        llm_apply_khad_pretransform(*model);
    }
    cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
    //cparams.split_mode_f16   = params.split_mode_f16;
    cparams.scheduler_async  = params.scheduler_async;
    cparams.min_experts      = params.min_experts;
    cparams.thresh_experts   = params.thresh_experts;
    cparams.cuda_params      = params.cuda_params;
    cparams.mtp              = params.mtp;
    cparams.worst_graph_tokens = params.worst_case_tokens;

    cparams.reduce_type      = params.type_reduce;
    cparams.graph_attn_precision = params.type_graph_attn;
    if (cparams.graph_attn_precision != GGML_TYPE_F16 && cparams.graph_attn_precision != GGML_TYPE_F32) {
        throw std::runtime_error(format("--graph-attn-precision must be f16 or f32, got %s",
                                        ggml_type_name(cparams.graph_attn_precision)));
    }
    cparams.pooling_type     = params.pooling_type;

    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

    // this is necessary due to kv_self.n being padded later during inference
    cparams.n_ctx            = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

    // with causal attention, the batch size is limited by the context size
    cparams.n_batch          = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;

    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
        cparams.n_batch = GGML_KQ_MASK_PAD;
    }

    cparams.n_ubatch         = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

    cparams.n_ctx_orig_yarn  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
                               hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
                                                              hparams.n_ctx_train;

    cparams.cb_eval           = params.cb_eval;
    cparams.cb_eval_user_data = params.cb_eval_user_data;

    auto rope_scaling_type = params.rope_scaling_type;
    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
        rope_scaling_type = hparams.rope_scaling_type_train;
    }

    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
    }

    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
    }
    cparams.yarn_attn_factor *= hparams.rope_attn_factor;

    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
        } else {
            cparams.pooling_type = hparams.pooling_type;
        }
    }

    if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
        cparams.causal_attn = hparams.causal_attn;
    } else {
        cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
    }

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    if (!model->is_mla_model() && cparams.mla_attn != 0) {
        cparams.mla_attn = 0;
    } else {
        if (model->n_gpu_layers > 0 && model->n_gpu_layers < model->hparams.n_layer && cparams.mla_attn != 3) {
            LLAMA_LOG_WARN("=============================================================================\n");
            LLAMA_LOG_WARN("MLA models with ngl < n_layer and split mode graph do not work with mla = %d\n", cparams.mla_attn);
            LLAMA_LOG_WARN("    => changing mla to 3\n");
            LLAMA_LOG_WARN("=============================================================================\n");
            cparams.mla_attn = 3;
        }
    }
    if (model->arch == LLM_ARCH_OPENAI_MOE && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
        if (cparams.reduce_type == GGML_TYPE_F16) {
            LLAMA_LOG_WARN("=====================================================================\n");
            LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");
            LLAMA_LOG_WARN("    => changing cparams.reduce_type to f32\n");
            LLAMA_LOG_WARN("=====================================================================\n");
            cparams.reduce_type = GGML_TYPE_F32;
        }
    }
    if ((model->arch == LLM_ARCH_MELLUM || model->arch == LLM_ARCH_LAGUNA) && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
        if (cparams.reduce_type == GGML_TYPE_F16) {
            const char * mname = model->arch == LLM_ARCH_MELLUM ? "Mellum" : "Laguna";
            LLAMA_LOG_WARN("=====================================================================\n");
            LLAMA_LOG_WARN("%s with split mode graph requires bf16 or f32 precision\n", mname);
            LLAMA_LOG_WARN("    => changing cparams.reduce_type to bf16\n");
            LLAMA_LOG_WARN("=====================================================================\n");
            cparams.reduce_type = GGML_TYPE_BF16;
        }
    }

    if (model->arch != LLM_ARCH_GLM4_MOE && model->arch != LLM_ARCH_QWEN35 &&
        model->arch != LLM_ARCH_QWEN35MOE && model->arch != LLM_ARCH_GEMMA4 &&
        model->arch != LLM_ARCH_GEMMA4_MTP && model->arch != LLM_ARCH_GLM_DSA &&
        model->arch != LLM_ARCH_GEMMA4_ASSISTANT &&
        cparams.mtp != 0) {
        cparams.mtp = 0;
    }

    cparams.mtp_op_type = params.mtp_op_type;

    LLAMA_LOG_INFO("%s: n_ctx         = %u\n",     __func__, cparams.n_ctx);
    LLAMA_LOG_INFO("%s: n_batch       = %u\n",     __func__, cparams.n_batch);
    LLAMA_LOG_INFO("%s: n_ubatch      = %u\n",     __func__, cparams.n_ubatch);
    LLAMA_LOG_INFO("%s: flash_attn    = %d\n",     __func__, cparams.flash_attn);
    if (model->is_mla_model()) {
    LLAMA_LOG_INFO("%s: mla_attn      = %d\n",     __func__, cparams.mla_attn);
    }
    LLAMA_LOG_INFO("%s: attn_max_b    = %d\n",     __func__, cparams.attn_max_batch);
    LLAMA_LOG_INFO("%s: fused_moe     = %d\n",     __func__, cparams.fused_moe_up_gate);
    LLAMA_LOG_INFO("%s: grouped er    = %d\n",     __func__, cparams.grouped_expert_routing);
    LLAMA_LOG_INFO("%s: fused_up_gate = %d\n",     __func__, cparams.fused_up_gate);
    LLAMA_LOG_INFO("%s: fused_mmad    = %d\n",     __func__, cparams.fused_mmad);
    LLAMA_LOG_INFO("%s: rope_cache    = %d\n",     __func__, cparams.rope_cache);
    LLAMA_LOG_INFO("%s: graph_reuse   = %d\n",     __func__, cparams.graph_reuse);
    LLAMA_LOG_INFO("%s: k_cache_hadam = %d\n",     __func__, cparams.k_cache_hadamard);
    LLAMA_LOG_INFO("%s: v_cache_hadam = %d\n",     __func__, cparams.v_cache_hadamard);
    LLAMA_LOG_INFO("%s: split_mode_graph_scheduling = %d\n",   __func__, cparams.split_mode_graph_scheduling);
    //LLAMA_LOG_INFO("%s: split_mode_f16= %d\n",     __func__, cparams.split_mode_f16);
    LLAMA_LOG_INFO("%s: reduce_type   = %s\n",     __func__, ggml_type_name(cparams.reduce_type));
    LLAMA_LOG_INFO("%s: sched_async   = %d\n",     __func__, cparams.scheduler_async);
    LLAMA_LOG_INFO("%s: ser           = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts);
    LLAMA_LOG_INFO("%s: freq_base     = %.1f\n",   __func__, cparams.rope_freq_base);
    LLAMA_LOG_INFO("%s: freq_scale    = %g\n",     __func__, cparams.rope_freq_scale);
    if (cparams.cuda_params) {
        LLAMA_LOG_INFO("%s: cuda_params   = %s\n", __func__, (const char *)cparams.cuda_params);
    }

    ctx->abort_callback      = params.abort_callback;
    ctx->abort_callback_data = params.abort_callback_data;

    ctx->sampling.rng = std::mt19937(params.seed);
    ctx->logits_all   = params.logits_all;
    // build worst-case graph for encoder if a model contains encoder
    ctx->is_encoding  = llama_model_has_encoder(model);

    uint32_t kv_size = cparams.n_ctx;
    ggml_type type_k = params.type_k;
    ggml_type type_v = params.type_v;

    // Mamba only needs a constant number of KV cache cells per sequence
    if (model->arch == LLM_ARCH_MAMBA) {
        // Mamba needs at least as many KV cells as there are sequences kept at any time
        kv_size = std::max((uint32_t) 1, params.n_seq_max);
        // it's probably best to keep as much precision as possible for the states
        type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states
        type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
    }

    GGML_ASSERT(hparams.n_embd_head_k(0) % ggml_blck_size(type_k) == 0);
    GGML_ASSERT(hparams.n_embd_head_v(0) % ggml_blck_size(type_v) == 0);
    if (!hparams.vocab_only) {
        // initialize backends
        // main_gpu is a local index into model->devices throughout the codebase
        // (auto-fit assigns device_count-1, MTP clamps to [0, device_count), buffer-type
        // setup wraps with model.devices[main_gpu]). Translate to a raw device id here.
        const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
            ? model->devices[model->main_gpu]
            : model->main_gpu;
#if defined(GGML_USE_METAL)
        if (model->n_gpu_layers > 0) {
            ctx->backend_metal = ggml_backend_metal_init();
            if (ctx->backend_metal == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
                llama_free(ctx);
                return nullptr;
            }
            ggml_backend_add_from_device(ctx,  ctx->backend_metal);
        }
#elif defined(GGML_USE_CUDA)
        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
            ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id);
                llama_free(ctx);
                return nullptr;
            }
            ggml_backend_add_from_device(ctx, backend);

        } else {
            // LLAMA_SPLIT_MODE_LAYER and LLAMA_SPLIT_MODE_GRAPH require a backend for each GPU
            auto params = cparams.cuda_params;
            std::string new_params;
            if (false && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
                static const std::string extra_string{"graphs=0"};
                if (params) new_params = std::string{(const char *)params} + ',';
                new_params += extra_string;
                params = new_params.data();
            }
            for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                ggml_backend_t backend = ggml_backend_cuda_init(device, params);
                if (backend == nullptr) {
                    LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
                    llama_free(ctx);
                    return nullptr;
                }
                ggml_backend_add_from_device(ctx, backend);
            }
        }
#elif defined(GGML_USE_VULKAN)
        if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH || model->split_mode == LLAMA_SPLIT_MODE_ATTN) {
            LLAMA_LOG_ERROR("%s: split mode 'graph' or 'attn' not supported. Failed to initialize Vulkan backend\n", __func__);
            llama_free(ctx);
            return nullptr;
        }
        if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
            ggml_backend_t backend = ggml_backend_vk_init(main_gpu_id);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
                llama_free(ctx);
                return nullptr;
            }
            ggml_backend_add_from_device(ctx, backend);
        } else {
            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
                ggml_backend_t backend = ggml_backend_vk_init(device);
                if (backend == nullptr) {
                    LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
                    llama_free(ctx);
                    return nullptr;
                }
                ggml_backend_add_from_device(ctx, backend);
            }
        }
#elif defined(GGML_USE_SYCL)
        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
            ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_id);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu_id);
                llama_free(ctx);
                return nullptr;
            }
            ctx->backends.push_back(backend);
        } else {
            // LLAMA_SPLIT_LAYER requires a backend for each GPU
            for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
                ggml_backend_t backend = ggml_backend_sycl_init(i);
                if (backend == nullptr) {
                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
                    llama_free(ctx);
                    return nullptr;
                }
                ggml_backend_add_from_device(ctx, backend);
            }
        }
#elif defined(GGML_USE_KOMPUTE)
        if (model->n_gpu_layers > 0) {
            auto * backend = ggml_backend_kompute_init(main_gpu_id);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
                llama_free(ctx);
                return nullptr;
            }
            ggml_backend_add_from_device(ctx, backend);
        }
#elif defined(GGML_USE_CANN)
    // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_GRAPH, only the main GPU backend is used
    // TODO: ggml_backend_cann is not support split tensor now, just leave code here.
    if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
        ggml_backend_t backend = ggml_backend_cann_init(main_gpu_id);
        if (backend == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu_id);
            llama_free(ctx);
            return nullptr;
        }
        ggml_backend_add_from_device(ctx, backend);
    } else {
        // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
        // TODO: currently, CANN can't use multi-gpus, just leave code here for further cann version.
        for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
            ggml_backend_t backend = ggml_backend_cann_init(device);
            if (backend == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
                llama_free(ctx);
                return nullptr;
            }
            ggml_backend_add_from_device(ctx, backend);
        }
    }
#endif

#ifdef GGML_USE_BLAS
        ctx->backend_blas = ggml_backend_blas_init();
        if (ctx->backend_blas == nullptr) {
            LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
        } else {
            ggml_backend_add_from_device(ctx, ctx->backend_blas);
        }
#endif

#if defined(GGML_USE_RPC)
        if (model->n_gpu_layers > 0) {
            for (const auto & device : model->rpc_servers) {
                ggml_backend_t backend = ggml_backend_rpc_init(device.endpoint.c_str(), device.device);
                if (backend == nullptr) {
                    LLAMA_LOG_ERROR("%s: failed to initialize RPC%d to '%s'\n", __func__, device.device,
                        device.endpoint.c_str());
                    llama_free(ctx);
                    return nullptr;
                }
                ggml_backend_add_from_device(ctx, backend);
            }
        }
#endif
        if (ctx->cparams.devices.size()) {
            // reorder the backend from devices params
            std::vector<ggml_backend_t> backends = {};
            std::vector<const char*> device_list = {};
            for (auto device : ctx->cparams.devices) {
                ggml_backend_t backend = ctx->ggml_backend_by_name(device.c_str());
                if (backend) {
                    backends.push_back(backend);
                }
            }
            ctx->backends = std::move(backends);
        }

        ctx->backend_cpu = ggml_backend_cpu_init();
        if (ctx->backend_cpu == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
            llama_free(ctx);
            return nullptr;
        }
        ctx->backends.push_back(ctx->backend_cpu);

        if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv,
                    params.type_k_first, params.type_k_last, params.type_v_first, params.type_v_last,
                    params.n_k_first, params.n_k_last, params.n_v_first, params.n_v_last)) {
            LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
            return nullptr;
        }

        {
            size_t memory_size_k = 0;
            size_t memory_size_v = 0;

            for (auto & k : ctx->kv_self.k_l) {
                if (k) {
                    memory_size_k += ggml_nbytes(k);
                }
            }

            for (auto & v : ctx->kv_self.v_l) {
                if (v) {
                    memory_size_v += ggml_nbytes(v);
                }
            }

            if (memory_size_k + memory_size_v > 0) {
	        if (cparams.mla_attn != 0 && !cparams.flash_attn) {
                    LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__,
                            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                            ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                            ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
                } else if (cparams.mla_attn != 0 && cparams.flash_attn) {
                    LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T: not used\n", __func__,
                            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                            ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f));
                } else {
                    LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
                            (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
                            ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
                            ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
                }
            }
        }

        // graph outputs buffer
        {
            // resized during inference when a batch uses more outputs
            if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
                LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
                llama_free(ctx);
                return nullptr;
            }

            LLAMA_LOG_INFO("%s: %10s  output buffer size = %8.2f MiB\n", __func__,
                    ggml_backend_buffer_name(ctx->buf_output),
                    ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
        }

        // scheduler and compute buffers
        {
            // buffer types used for the compute buffer of each backend
            std::vector<ggml_backend_buffer_type_t> backend_buft;
            for (auto * backend : ctx->backends) {
                if (ggml_backend_is_cpu(backend)) {
                    // use host buffers for the CPU backend compute buffer
                    backend_buft.push_back(llama_default_buffer_type_cpu(true));
                } else {
                    backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
                }
            }

            int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
            const size_t max_nodes = ctx->max_nodes(n_tokens, cparams.n_ctx);

            // buffer used to store the computation graph and the tensor meta data
            ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

            // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
            bool pipeline_parallel =
                llama_get_device_count(*model) > 1 &&
                model->n_gpu_layers > (int)model->hparams.n_layer &&
                model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
                params.offload_kqv && !model->has_tensor_overrides();
#ifndef GGML_USE_CUDA
            // pipeline parallelism requires support for async compute and events
            // currently this is only implemented in the CUDA backend
            pipeline_parallel = false;
#endif
            ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);

            if (pipeline_parallel) {
                LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
            }

            llama_repack_up_gate_exps(*ctx);

            // build worst-case graph
            int n_past = cparams.n_ctx - n_tokens;
            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
            ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true, cparams.worst_graph_tokens);

            // initialize scheduler with the worst-case graph
            bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
            if (!gf_success)
            {
                if (pipeline_parallel) {
                    LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
                    ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
                    gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
                }
                if (!gf_success) {
                    LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                    llama_free(ctx);
                    return nullptr;
                }
            }

            for (size_t i = 0; i < ctx->backends.size(); i++) {
                ggml_backend_t backend = ctx->backends[i];
                ggml_backend_buffer_type_t buft = backend_buft[i];
                size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
                if (size > 1) {
                    LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                            ggml_backend_buft_name(buft),
                            size / 1024.0 / 1024.0);
                }
            }

            // note: the number of splits during measure is higher than during inference due to the kv shift
            int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
            LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
        }
    }

    if (params.offload_policy) {
        const std::vector<std::pair<int, int>>& policy = *(const std::vector<std::pair<int, int>>*)params.offload_policy;
        for (auto [op, on_off] : policy) {
            if (op < 0 || op >= int(GGML_OP_COUNT)) {
                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for all ops to %s\n", on_off ? "ON" : "OFF");
            } else {
                LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for op %s to %s\n",
                        ggml_op_name(ggml_op(op)), on_off ? "ON" : "OFF");
            }
            ggml_backend_sched_set_op_offload(ctx->sched, ggml_op(op), on_off);
        }
    }

    if (params.only_active_experts) {
        LLAMA_LOG_INFO("%s: enabling only_active_experts scheduling\n", __func__);
        ggml_backend_sched_set_only_active_experts(ctx->sched, true);
    }
    if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {
        ggml_backend_sched_set_split_mode_graph(ctx->sched, true, cparams.scheduler_async);
        ggml_backend_sched_set_max_extra_alloc(ctx->sched, params.max_extra_alloc);
        if (model->has_tensor_overrides() && cparams.split_mode_graph_scheduling) {
            LLAMA_LOG_INFO("XXXXXXXX Split Mode Graph Scheduling is FORCED despite tensor overrides due to user choice.\n");
            LLAMA_LOG_INFO("XXXXXXXX It may or might NOT infer properly due to unsupported combinations between SMGS and every possible tensor overrides.\n");
        }
    }

    if (cparams.mtp && hparams.nextn_predict_layers > 0) {
        const auto n_batch = cparams.n_batch;
        const auto n_vocab = hparams.n_vocab;
        const auto n_embd  = hparams.n_embd;

        const size_t logits_size = n_vocab*n_batch;
        const size_t embd_size   = n_embd*n_batch;

        if (ctx->output_ids.empty()) {
            // init, never resized afterwards
            ctx->output_ids.resize(n_batch);
        }

        const size_t prev_size = ctx->buf_output ? ggml_backend_buffer_get_size(ctx->buf_output) : 0;
        const size_t new_size  = (logits_size + embd_size) * sizeof(float);

        // alloc only when more than the current capacity is required
        if (!ctx->buf_output || prev_size < new_size) {
            if (ctx->buf_output) {
                ggml_backend_buffer_free(ctx->buf_output);
                ctx->buf_output = nullptr;
                ctx->logits = nullptr;
                ctx->embd = nullptr;
            }

            ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
            if (ctx->buf_output == nullptr) {
                LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
            }
        }
    }

    return ctx;
}

void llama_free(struct llama_context * ctx) {
    delete ctx;
}

const struct llama_vocab* llama_model_get_vocab(const struct llama_model* model) {
    return &model->vocab;
}

const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
}

const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
    return &ctx->model.vocab;
}

uint32_t llama_n_ctx(const struct llama_context * ctx) {
    return ctx->cparams.n_ctx;
}

uint32_t llama_n_batch(const struct llama_context * ctx) {
    return ctx->cparams.n_batch;
}

uint32_t llama_n_ubatch(const struct llama_context * ctx) {
    return ctx->cparams.n_ubatch;
}

uint32_t llama_n_seq_max(const struct llama_context * ctx) {
    return ctx->kv_self.size;
}

enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    return vocab->get_type();
}

const struct llama_vocab* llama_get_model_vocab(const struct llama_model* model) {
    return &model->vocab;
}

enum llama_rope_type llama_rope_type(const struct llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_LLAMA4:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GLM4:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_COHERE2_MOE:
        case LLM_ARCH_ERNIE4_5:
        case LLM_ARCH_ERNIE4_5_MOE:
        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_GLM_DSA:
        case LLM_ARCH_MISTRAL4:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_GLM4_MOE:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_BITNET_25:
        case LLM_ARCH_BITNET_B158:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_QWEN3:
        case LLM_ARCH_QWEN3MOE:
        case LLM_ARCH_MELLUM:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_GEMMA3:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_MINIMAX_M2:
        case LLM_ARCH_MIMO2:
        case LLM_ARCH_SEED_OSS:
        case LLM_ARCH_STEP35:
        case LLM_ARCH_LAGUNA:
        case LLM_ARCH_GEMMA4:
        case LLM_ARCH_GEMMA4_MTP:
        case LLM_ARCH_GEMMA4_ASSISTANT:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
        case LLM_ARCH_QWEN35MOE:
        case LLM_ARCH_QWEN35:
            return LLAMA_ROPE_TYPE_IMROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}

enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
    return ctx->cparams.pooling_type;
}

int32_t llama_n_vocab(const struct llama_model * model) {
    return model->hparams.n_vocab;
}

int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_model_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_model_n_embd_inp(const llama_model* model) {
    return model->hparams.n_embd_inp();
}

int32_t llama_n_layer(const struct llama_model * model) {
    return model->hparams.n_layer;
}

float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const struct llama_model * model) {
    return (int)model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int)model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
            llama_model_arch_name(model->arch),
            llama_model_type_name(model->type),
            llama_model_ftype_name(model->ftype).c_str());
}

uint64_t llama_model_size(const struct llama_model * model) {
    uint64_t size = 0;
    for (const auto & it : model->tensors_by_name) {
        size += ggml_nbytes(it.second);
    }
    return size;
}

const char* llama_model_chat_template(const struct llama_model* model, const char* name) {
    const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
        : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
    const auto& it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        // one-off fix for very popular models (so we are not flooded with issues)
        // do not extend this list unless absolutely necessary
        // Mistral-Small-2503 does not have built-in chat template
        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
        if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
            return "mistral-v7-tekken";
        }

        return nullptr;
    }
    return it->second.c_str();
}

uint64_t llama_model_n_params(const struct llama_model * model) {
    uint64_t nparams = 0;
    for (const auto & it : model->tensors_by_name) {
        nparams += ggml_nelements(it.second);
    }
    return nparams;
}

struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
                return it.first == name;
            });
    if (it == model->tensors_by_name.end()) {
        return nullptr;
    }
    return it->second;
}

bool llama_model_has_encoder(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const struct llama_model * model) {
    return model->hparams.dec_start_token_id;
}

struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
    try {
        struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
        llama_lora_adapter_init_internal(model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
        return nullptr;
    }
}

static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
    GGML_ASSERT(cvec.tensors.empty());
    GGML_ASSERT(cvec.ctxs.empty());
    GGML_ASSERT(cvec.bufs.empty());

    // count layer buffer types
    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
        buft_layer_count[model.buft_layer[i].buft]++;
    }

    // allocate contexts
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    for (auto & it : buft_layer_count) {
        int n_layers = it.second;
        struct ggml_init_params params = {
            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(params);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return 1;
        }
        ctx_map[it.first] = ctx;
    }

    // make tensors
    cvec.tensors.reserve(model.hparams.n_layer);
    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < model.hparams.n_layer; il++) {
        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
        cvec.tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
    cvec.ctxs.reserve(ctx_map.size());
    cvec.bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
        cvec.ctxs.push_back(ctx);
        cvec.bufs.push_back(buf);
    }

    return true;
}

int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
    const llama_model & model = lctx->model;
    llama_control_vector & cvec = lctx->cvec;

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
        cvec.layer_start = -1;
        cvec.layer_end   = -1;
        return 0;
    }

    if (n_embd != (int) model.hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return 1;
    }

    if (cvec.tensors.empty()) {
        if (!llama_control_vector_init(cvec, model)) {
            return 1;
        }
    }

    cvec.layer_start = il_start;
    cvec.layer_end   = il_end;

    for (size_t il = 1; il < model.hparams.n_layer; il++) {
        assert(cvec.tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
        }
    }

    return 0;
}

struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
    struct llama_kv_cache_view result = {
        /*.n_cells            = */ 0,
        /*.n_seq_max          = */ n_seq_max,
        /*.token_count        = */ 0,
        /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
        /*.max_contiguous     = */ 0,
        /*.max_contiguous_idx = */ -1,
        /*.cells              = */ nullptr,
        /*.cells_sequences    = */ nullptr,
    };
    return result;
}

void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
    if (view->cells != nullptr) {
        free(view->cells);
        view->cells = nullptr;
    }
    if (view->cells_sequences != nullptr) {
        free(view->cells_sequences);
        view->cells_sequences = nullptr;
    }
}

void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
        view->n_cells = int32_t(ctx->kv_self.size);
        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
        view->cells = (struct llama_kv_cache_view_cell *)p;
        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
        view->cells_sequences = (llama_seq_id *)p;
    }

    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
    llama_kv_cache_view_cell * c_curr = view->cells;
    llama_seq_id * cs_curr = view->cells_sequences;
    int32_t used_cells = 0;
    int32_t token_count = 0;
    int32_t curr_contig_idx = -1;
    uint32_t max_contig = 0;
    int32_t max_contig_idx = -1;

    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
        const size_t curr_size = kv_cells[i].seq_id.size();
        token_count += curr_size;
        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;

        if (curr_size > 0) {
            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
                max_contig = i - curr_contig_idx;
                max_contig_idx = curr_contig_idx;
            }
            curr_contig_idx = -1;
        } else if (curr_contig_idx < 0) {
            curr_contig_idx = i;
        }

        int seq_idx = 0;
        for (const llama_seq_id it : kv_cells[i].seq_id) {
            if (seq_idx >= view->n_seq_max) {
                break;
            }
            cs_curr[seq_idx] = it;
            seq_idx++;
        }
        if (seq_idx != 0) {
            used_cells++;
        }
        for (; seq_idx < view->n_seq_max; seq_idx++) {
            cs_curr[seq_idx] = -1;
        }
    }
    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
        max_contig_idx = curr_contig_idx;
        max_contig = kv_cells.size() - curr_contig_idx;
    }
    view->max_contiguous = max_contig;
    view->max_contiguous_idx = max_contig_idx;
    view->token_count = token_count;
    view->used_cells = used_cells;
    if (uint32_t(used_cells) != ctx->kv_self.used) {
        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
            __func__, ctx->kv_self.used, used_cells);
    }
}

int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    int result = 0;

    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
        result += ctx->kv_self.cells[i].seq_id.size();
    }

    return result;
}

int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
    return ctx->kv_self.used;
}

void llama_kv_cache_clear(struct llama_context * ctx) {
    llama_kv_cache_clear(ctx->kv_self);
}

// Unified speculative-checkpoint
static bool spec_ckpt_try_per_step(llama_kv_cache & kv, const llama_model & model, int max_tokens) {
    // Split recurrent tensors are supported as long as each layer exposes
    // concrete backend buffers for the per-step tensors. CPU-only and mixed
    // CPU/GPU recurrent placement are also allowed.
    bool has_gpu = false;
    bool has_cpu = false;
    for (const auto * sl : kv.s_l) {
        if (!sl) continue;
        if (sl->extra) {
            has_gpu = true;
            continue;
        }
        if (sl->buffer && !ggml_backend_buffer_is_host(sl->buffer)) {
            has_gpu = true;
        } else if (sl->buffer) {
            has_cpu = true;
        }
    }
    if (!has_gpu && !has_cpu) {
        kv.save_per_step_ssm = false;
        return false;
    }

    // Populate per-step dimensions from hparams
    if (kv.ckpt.per_step_ssm_state_size <= 0) {
        const auto & hp       = model.hparams;
        const int64_t nv      = hp.ssm_dt_rank;
        const int64_t head_v  = hp.ssm_d_inner / nv;
        const int64_t head_k  = hp.ssm_d_state;
        const int64_t nk      = hp.ssm_n_group;
        const int64_t key_dim = head_k * nk;
        const int64_t val_dim = head_v * nv;
        const int64_t conv_dim = key_dim * 2 + val_dim;

        kv.ckpt.per_step_ssm_state_size = head_v * head_v * nv;
        kv.ckpt.per_step_conv_state_dim = (hp.ssm_d_conv - 1) * conv_dim;
        kv.ckpt.per_step_conv_dim       = conv_dim;
        kv.ckpt.per_step_d_conv         = hp.ssm_d_conv;
    }

    if (!kv.per_step_alloc(model, max_tokens)) {
        kv.save_per_step_ssm = false;
        return false;
    }

    if (!kv.checkpoint_alloc_shadows(true)) {
        LLAMA_LOG_ERROR("%s: failed to allocate conv-state shadow buffers for per-step checkpoints\n", __func__);
        kv.save_per_step_ssm = false;
        return false;
    }

    return true;
}

static size_t llama_spec_ckpt_cpu_state_reserve(const llama_context * ctx, llama_seq_id seq_id) {
    const auto & kv_self = ctx->kv_self;

    size_t size = sizeof(uint32_t); // cell_count

    if (seq_id >= 0 && llama_kv_qnext_seq_id_in_range(kv_self, seq_id) && (uint32_t) seq_id < kv_self.size) {
        size += sizeof(llama_pos);
        size += sizeof(uint32_t); // n_seq_id = 0 for seq-specific saves
    }

    const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0;
    const uint32_t n_layer = kv_self.k_l.size();

    size += sizeof(v_state);
    size += sizeof(n_layer);
    size += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint64_t));

    if (v_state == 0) {
        size += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint64_t));
    } else if (v_state == 1) {
        size += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint32_t) + sizeof(uint32_t));
    }

    const uint32_t qnext_state = llama_kv_has_qnext_state_storage(kv_self) ? 1 : 0;
    size += sizeof(qnext_state);

    if (qnext_state != 0) {
        for (uint32_t il = 0; il < n_layer; ++il) {
            const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr;
            const uint64_t s_size_row = has_s_cache ? ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]) : 0;
            const uint32_t s_rows = has_s_cache && seq_id >= 0 && llama_kv_qnext_seq_id_in_range(kv_self, seq_id) && (uint32_t) seq_id < kv_self.size
                ? 1
                : 0;

            size += sizeof(int32_t);
            size += sizeof(uint64_t);
            size += sizeof(uint32_t);
            size += (size_t) s_rows * s_size_row;
        }
    }
    return size;
}

static const char * llama_spec_ckpt_mode_name(int mode) {
    switch (mode) {
        case LLAMA_SPEC_CKPT_PER_STEP:
            return "per-step";
        case LLAMA_SPEC_CKPT_GPU_FALLBACK:
            return "gpu-fallback";
        case LLAMA_SPEC_CKPT_CPU:
            return "cpu";
        case LLAMA_SPEC_CKPT_AUTO:
            return "auto";
        default:
            return "none";
    }
}

int llama_spec_ckpt_init(struct llama_context * ctx, int mode, int max_tokens) {
    auto & kv = ctx->kv_self;

    kv.save_per_step_ssm     = false;
    kv.ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_NONE;

    if (!kv.checkpoint_supported()) {
        return (int)LLAMA_SPEC_CKPT_NONE;
    }

    if (kv.ckpt.fixed_spec_mode != LLAMA_SPEC_CKPT_NONE) {
        if (kv.ckpt.fixed_spec_mode == LLAMA_SPEC_CKPT_PER_STEP && max_tokens > kv.ckpt.fixed_max_tokens) {
            LLAMA_LOG_WARN("%s: fixed per-step checkpoint capacity is %d tokens, but the current speculative batch requests %d; disabling checkpoint for this batch\n",
                    __func__, kv.ckpt.fixed_max_tokens, max_tokens);
            return (int)LLAMA_SPEC_CKPT_NONE;
        }

        kv.ckpt.selected_spec_mode = kv.ckpt.fixed_spec_mode;
        return kv.ckpt.selected_spec_mode;
    }

    int requested = mode;
    int resolved = LLAMA_SPEC_CKPT_NONE;

    // prefer PER_STEP → GPU_FALLBACK → CPU
    if (requested == LLAMA_SPEC_CKPT_AUTO) {
        requested = LLAMA_SPEC_CKPT_PER_STEP;
    }

    if (requested == LLAMA_SPEC_CKPT_PER_STEP) {
        if (spec_ckpt_try_per_step(kv, ctx->model, max_tokens)) {
            resolved = LLAMA_SPEC_CKPT_PER_STEP;
        } else if (mode == LLAMA_SPEC_CKPT_PER_STEP) {
            LLAMA_LOG_ERROR("%s: failed to preallocate per-step checkpoint buffers for max_tokens=%d; --recurrent-ckpt-mode=%s requires startup allocation\n",
                    __func__, max_tokens, llama_spec_ckpt_mode_name(mode));
            return (int)LLAMA_SPEC_CKPT_NONE;
        } else {
            LLAMA_LOG_WARN("%s: auto checkpoint mode could not preallocate per-step buffers for max_tokens=%d; falling back to gpu-fallback\n",
                    __func__, max_tokens);
            requested = LLAMA_SPEC_CKPT_GPU_FALLBACK;
        }
    }

    if (resolved == LLAMA_SPEC_CKPT_NONE && requested == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        if (kv.checkpoint_alloc_shadows()) {
            resolved = LLAMA_SPEC_CKPT_GPU_FALLBACK;
        } else if (mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
            LLAMA_LOG_ERROR("%s: failed to preallocate gpu-fallback checkpoint shadows at startup; --recurrent-ckpt-mode=%s requires startup allocation\n",
                    __func__, llama_spec_ckpt_mode_name(mode));
            return (int)LLAMA_SPEC_CKPT_NONE;
        } else {
            LLAMA_LOG_WARN("%s: auto checkpoint mode could not preallocate gpu-fallback checkpoint shadows; falling back to cpu\n",
                    __func__);
            requested = LLAMA_SPEC_CKPT_CPU;
        }
    }

    if (resolved == LLAMA_SPEC_CKPT_NONE) {
        resolved = LLAMA_SPEC_CKPT_CPU;
    }

    if (resolved == LLAMA_SPEC_CKPT_CPU) {
        const size_t cpu_reserve = llama_spec_ckpt_cpu_state_reserve(ctx, 0);
        kv.ckpt.cpu_state_data.clear();
        kv.ckpt.cpu_state_data.reserve(cpu_reserve);
        LLAMA_LOG_INFO("%s: CPU serialized checkpoint reserve = %8.2f MiB (per seq)\n",
                __func__, cpu_reserve / 1024.0 / 1024.0);
    }

    kv.ckpt.fixed_spec_mode = resolved;
    kv.ckpt.fixed_max_tokens = resolved == LLAMA_SPEC_CKPT_PER_STEP ? max_tokens : 0;
    kv.ckpt.selected_spec_mode = resolved;

    LLAMA_LOG_INFO("%s: fixed recurrent checkpoint mode = %s%s\n",
            __func__, llama_spec_ckpt_mode_name(resolved),
            resolved == LLAMA_SPEC_CKPT_PER_STEP ? (std::string(" (max_tokens=") + std::to_string(max_tokens) + ")").c_str() : "");

    return resolved;
}

bool llama_spec_ckpt_save(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;

    switch (kv.ckpt.selected_spec_mode) {
        case LLAMA_SPEC_CKPT_PER_STEP:
            kv.save_per_step_ssm = true;
            return true;

        case LLAMA_SPEC_CKPT_GPU_FALLBACK:
            return kv.checkpoint_save(ctx->sched);

        case LLAMA_SPEC_CKPT_CPU: {
            const size_t need = llama_state_seq_get_size(ctx, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
            kv.ckpt.cpu_state_data.resize(need);
            const size_t written = llama_state_seq_get_data(
                ctx, kv.ckpt.cpu_state_data.data(), need, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
            kv.ckpt.cpu_state_data.resize(written);
            return written > 0;
        }

        default:
            return false;
    }
}

bool llama_spec_ckpt_restore(struct llama_context * ctx, llama_seq_id seq_id,
                              llama_pos n_past, int accepted_step) {
    auto & kv = ctx->kv_self;

    switch (kv.ckpt.selected_spec_mode) {
        case LLAMA_SPEC_CKPT_PER_STEP: {
            if (!kv.per_step_restore(ctx->model, ctx->sched, accepted_step)) {
                return false;
            }
            const llama_pos accepted_pos = n_past + accepted_step;
            if (seq_id >= 0 && (uint32_t)seq_id < kv.size) {
                kv.cells[seq_id].pos = accepted_pos;
            }
            llama_kv_cache_seq_rm(kv, seq_id, accepted_pos + 1, -1);
            return true;
        }

        case LLAMA_SPEC_CKPT_GPU_FALLBACK:
            kv.checkpoint_restore(ctx->sched);
            llama_kv_cache_seq_rm(kv, seq_id, n_past, -1);
            return false;

        case LLAMA_SPEC_CKPT_CPU:
            if (!kv.ckpt.cpu_state_data.empty()) {
                llama_state_seq_set_data(ctx, kv.ckpt.cpu_state_data.data(),
                                         kv.ckpt.cpu_state_data.size(), seq_id,
                                         LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
            }
            llama_kv_cache_seq_rm(kv, seq_id, n_past, -1);
            return false;

        default:
            return false;
    }
}

void llama_spec_ckpt_discard(struct llama_context * ctx) {
    auto & kv = ctx->kv_self;

    if (kv.ckpt.selected_spec_mode == LLAMA_SPEC_CKPT_PER_STEP) {
        kv.save_per_step_ssm = false;
        kv.checkpoint_delete();
    } else if (kv.ckpt.selected_spec_mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        kv.checkpoint_delete();
    }

    kv.ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_NONE;
    kv.ckpt.cpu_state_data.clear();
}

bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1);
}

void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    if (seq_id_src == seq_id_dst) {
        return;
    }
    llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
}

void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
    llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
}

void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
    if (delta == 0) {
        return;
    }

    llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
}

void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    if (d == 1) {
        return;
    }

    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
}

llama_pos llama_kv_cache_seq_pos_min(struct llama_context * ctx, llama_seq_id seq_id) {
    if (ctx->kv_self.hybrid || ctx->kv_self.recurrent) {
        return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
    }
    return llama_kv_cache_seq_pos_min(ctx->kv_self, seq_id);
}

llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
    return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
}

void llama_kv_cache_defrag(struct llama_context * ctx) {
    llama_kv_cache_defrag(ctx->kv_self);
}

int32_t llama_kv_cache_update(struct llama_context * ctx) {
    return llama_kv_cache_update_internal(*ctx);
}

// deprecated
size_t llama_get_state_size(struct llama_context * ctx) {
    return llama_state_get_size(ctx);
}

// deprecated
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
    return llama_state_get_data(ctx, dst, -1);
}

// deprecated
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
    return llama_state_set_data(ctx, src, -1);
}

// deprecated
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
}

// deprecated
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
}

static inline ggml_tensor * get_kv_cache_split_tensor(const ggml_tensor * tensor, const llama_layer & l) {
    if (!l.wv) return l.wk;
    bool use_V_for_K = l.attn_k_norm && l.attn_k_norm->ne[0] == l.wk->ne[1] ? true : false;
    auto kv = tensor->ne[1] > 1 && !use_V_for_K ? l.wk : l.wv;
    return kv;
}

// TODO: replace all non-fatal assertions with returned errors or exceptions
struct llama_data_write {
    virtual void write(const void * src, size_t size) = 0;
    virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_write() = default;

    void write_string(const std::string & str) {
        uint32_t str_size = str.size();

        write(&str_size,  sizeof(str_size));
        write(str.data(), str_size);
    }

    void write_model_info(const struct llama_context * ctx) {
        std::string arch_str = llama_model_arch_name(ctx->model.arch);
        write_string(arch_str);
        // TODO: add more model-specific info which should prevent loading the session file if not identical
    }

    void write_rng(const std::mt19937 & rng) {
        std::ostringstream rng_ss;
        rng_ss << rng;

        const std::string & rng_str = rng_ss.str();

        write_string(rng_str);
    }

    void write_output_ids(const struct llama_context * ctx) {
        const uint32_t n_outputs = ctx->n_outputs;

        std::vector<int32_t> output_pos;

        const size_t    n_batch = ctx->cparams.n_batch;
        const auto & output_ids = ctx->output_ids;

        GGML_ASSERT(n_outputs <= ctx->output_size);

        output_pos.resize(n_outputs);

        // build a more compact representation of the output ids
        for (size_t i = 0; i < n_batch; ++i) {
            // map an output id to a position in the batch
            int32_t pos = output_ids[i];
            if (pos >= 0) {
                GGML_ASSERT((uint32_t) pos < n_outputs);
                output_pos[pos] = i;
            }
        }

        write(&n_outputs, sizeof(n_outputs));

        if (n_outputs) {
            write(output_pos.data(), n_outputs * sizeof(int32_t));
        }
    }

    void write_logits(const struct llama_context * ctx) {
        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);

        write(&logits_size, sizeof(logits_size));

        if (logits_size) {
            write(ctx->logits, logits_size * sizeof(float));
        }
    }

    void write_embeddings(const struct llama_context * ctx) {
        const uint64_t row_width = llama_output_embd_width(*ctx);
        const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs_embd * row_width);

        write(&embeddings_size, sizeof(embeddings_size));

        if (embeddings_size) {
            write(ctx->embd, embeddings_size * sizeof(float));
        }
    }

    void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {

        for (const auto & range : cell_ranges) {
            for (uint32_t i = range.first; i < range.second; ++i) {
                const auto & cell = kv_self.cells[i];
                const llama_pos pos      = cell.pos;
                const uint32_t  n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;

                write(&pos,      sizeof(pos));
                write(&n_seq_id, sizeof(n_seq_id));

                if (n_seq_id) {
                    for (auto seq_id : cell.seq_id) {
                        write(&seq_id, sizeof(seq_id));
                    }
                }
            }
        }
    }

    void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1,
        llama_state_seq_flags flags = 0) {
        const struct llama_kv_cache & kv_self = ctx->kv_self;
        const struct llama_hparams & hparams = ctx->model.hparams;
        bool need_kv = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0;
        // v_state: 0 -> not transposed V cache
        //          1 -> transposed V cache
        //          2 -> no V cache (as it may be the case with MLA)
        const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0;
        const uint32_t n_layer = kv_self.k_l.size();

        write(&v_state, sizeof(v_state));
        write(&n_layer, sizeof(n_layer));

        // Iterate and write all the keys first, each row is a cell
        // Get whole range at a time
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
            const uint32_t kv_lora_rank = hparams.n_lora_kv;
            const bool has_k_cache = kv_self.k_l[il] != nullptr && need_kv;

            // Write key type
            const int32_t k_type_i = has_k_cache ? (int32_t) kv_self.k_l[il]->type : -1;
            write(&k_type_i, sizeof(k_type_i));

            // Write row size of key
            const uint64_t k_size_row = has_k_cache
                ? ((ctx->cparams.mla_attn == 0)
                    ? ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)
                    : ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope))
                : 0;
            write(&k_size_row, sizeof(k_size_row));

            if (!has_k_cache) {
                continue;
            }

            // Read each range of cells of k_size length each into tmp_buf and write out
            for (const auto & range : cell_ranges) {
                const size_t range_size = range.second - range.first;
                const size_t buf_size = range_size * k_size_row;
                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size, il);
            }
        }

        if (v_state == 0) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;

                // Write value type
                const int32_t v_type_i = has_v_cache ? (int32_t) kv_self.v_l[il]->type : -1;
                write(&v_type_i, sizeof(v_type_i));

                // Write row size of value
                const uint64_t v_size_row = has_v_cache ? ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa) : 0;
                write(&v_size_row, sizeof(v_size_row));

                if (!has_v_cache) {
                    continue;
                }

                // Read each range of cells of v_size length each into tmp_buf and write out
                for (const auto & range : cell_ranges) {
                    const size_t range_size = range.second - range.first;
                    const size_t buf_size = range_size * v_size_row;
                    write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size, il);
                }
            }
        }
        else if (v_state == 1) {
            // When v is transposed, we also need the element size and get the element ranges from each row
            const uint32_t kv_size = kv_self.size;
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;

                // Write value type
                const int32_t v_type_i = has_v_cache ? (int32_t) kv_self.v_l[il]->type : -1;
                write(&v_type_i, sizeof(v_type_i));

                // Write element size
                const uint32_t v_size_el = has_v_cache ? ggml_type_size(kv_self.v_l[il]->type) : 0;
                write(&v_size_el, sizeof(v_size_el));

                // Write GQA embedding size
                const uint32_t n_embd_v_gqa_write = has_v_cache ? n_embd_v_gqa : 0;
                write(&n_embd_v_gqa_write, sizeof(n_embd_v_gqa_write));

                if (!has_v_cache) {
                    continue;
                }

                // For each row, we get the element values of each cell
                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                    // Read each range of cells of v_size_el length each into tmp_buf and write out
                    for (const auto & range : cell_ranges) {
                        const size_t range_size = range.second - range.first;
                        const size_t src_offset = (range.first + j * kv_size) * v_size_el;
                        const size_t buf_size = range_size * v_size_el;
                        write_tensor_data(kv_self.v_l[il], src_offset, buf_size, il);
                    }
                }
            }
        }

        const uint32_t qnext_state = llama_kv_has_qnext_state_storage(kv_self) ? 1 : 0;
        write(&qnext_state, sizeof(qnext_state));

        if (qnext_state != 0) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr;

                const int32_t s_type_i = has_s_cache ? (int32_t) kv_self.s_l[il]->type : -1;
                write(&s_type_i, sizeof(s_type_i));

                const uint64_t s_size_row = has_s_cache ? ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]) : 0;
                write(&s_size_row, sizeof(s_size_row));

                uint32_t s_rows = 0;
                size_t s_offset = 0;
                if (has_s_cache) {
                    const uint32_t n_slots = (uint32_t) kv_self.s_l[il]->ne[1];
                    if (seq_id == -1) {
                        s_rows = n_slots;
                    } else if (llama_kv_qnext_seq_id_in_range(kv_self, seq_id) && (uint32_t) seq_id < kv_self.size) {
                        llama_seq_id src_seq_id = kv_self.cells[seq_id].src;
                        if (llama_kv_qnext_seq_id_in_range(kv_self, src_seq_id)) {
                            s_rows = 1;
                            s_offset = (size_t) src_seq_id * s_size_row;
                        }
                    }
                }

                write(&s_rows, sizeof(s_rows));

                if (has_s_cache && s_rows > 0) {
                    write_tensor_data(kv_self.s_l[il], s_offset, s_rows * s_size_row, il);
                }
            }
        }
    }

    void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) {
        const struct llama_kv_cache & kv_self = ctx->kv_self;
        std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
        uint32_t cell_count = 0;

        // Count the number of cells with the specified seq_id
        // Find all the ranges of cells with this seq id (or all, when -1)
        uint32_t cell_range_begin = kv_self.size;
        for (uint32_t i = 0; i < kv_self.size; ++i) {
            const auto & cell = kv_self.cells[i];
            if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
                ++cell_count;
                if (cell_range_begin == kv_self.size) {
                    cell_range_begin = i;
                }
            } else {
                if (cell_range_begin != kv_self.size) {
                    cell_ranges.emplace_back(cell_range_begin, i);
                    cell_range_begin = kv_self.size;
                }
            }
        }
        if (cell_range_begin != kv_self.size) {
            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
        }

        // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
        uint32_t cell_count_check = 0;
        for (const auto & range : cell_ranges) {
            cell_count_check += range.second - range.first;
        }
        GGML_ASSERT(cell_count == cell_count_check);

        write(&cell_count, sizeof(cell_count));

        write_kv_cache_meta(kv_self, cell_ranges, seq_id);
        write_kv_cache_data(ctx, cell_ranges, seq_id, flags);
    }
};

struct llama_data_read {
    virtual const uint8_t * read(size_t size) = 0;
    virtual void read_to(void * dst, size_t size) = 0;
    virtual size_t get_size_read() = 0;
    virtual ~llama_data_read() = default;

    void read_string(std::string & str) {
        uint32_t str_size;
        read_to(&str_size, sizeof(str_size));

        str.assign((const char *) read(str_size), str_size);
    }

    // validate model information
    void read_model_info(const struct llama_context * ctx) {
        std::string cur_arch_str = llama_model_arch_name(ctx->model.arch);
        std::string arch_str;
        read_string(arch_str);
        if (cur_arch_str != arch_str) {
            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
        }
        // TODO: add more info which needs to be identical but which is not verified otherwise
    }

    void read_rng(std::mt19937 & rng) {
        std::string rng_str;
        read_string(rng_str);

        std::istringstream rng_ss(rng_str);
        rng_ss >> rng;

        if (rng_ss.fail()) {
            throw std::runtime_error("failed to load RNG state");
        }
    }

    void read_output_ids(struct llama_context * ctx) {
        std::vector<int32_t> output_pos;

        uint32_t n_outputs;
        read_to(&n_outputs, sizeof(n_outputs));

        if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
            throw std::runtime_error("could not reserve outputs");
        }

        if (n_outputs) {
            output_pos.resize(n_outputs);
            read_to(output_pos.data(), n_outputs * sizeof(int32_t));

            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
                int32_t id = output_pos[i];
                if ((uint32_t) id >= ctx->cparams.n_batch) {
                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
                }
                ctx->output_ids[id] = i;
            }

            ctx->n_outputs = n_outputs;
        }
    }

    void read_logits(struct llama_context * ctx) {
        uint64_t logits_size;
        read_to(&logits_size, sizeof(logits_size));

        if (ctx->logits_size < logits_size) {
            throw std::runtime_error("logits buffer too small");
        }

        if (logits_size) {
            read_to(ctx->logits, logits_size * sizeof(float));
        }
    }

    void read_embeddings(struct llama_context * ctx) {
        uint64_t embeddings_size;
        read_to(&embeddings_size, sizeof(embeddings_size));

        if (ctx->embd_size < embeddings_size) {
            throw std::runtime_error("embeddings buffer too small");
        }

        const uint64_t row_width = llama_output_embd_width(*ctx);
        if (row_width == 0 || (embeddings_size % row_width) != 0) {
            throw std::runtime_error("invalid embeddings payload size");
        }

        ctx->n_outputs_embd = embeddings_size / row_width;

        if (embeddings_size) {
            read_to(ctx->embd, embeddings_size * sizeof(float));
        }
    }

    bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
        struct llama_kv_cache & kv_self = ctx->kv_self;

        if (dest_seq_id != -1) {
            // single sequence

            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);

            llama_batch batch = llama_batch_init(cell_count, 0, 1);
            batch.n_tokens = cell_count;
            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_pos pos;
                uint32_t n_seq_id;

                read_to(&pos, sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));

                if (n_seq_id != 0) {
		    llama_batch_free(batch);
                    LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
                    return false;
                }

                batch.pos[i] = pos;
                batch.n_seq_id[i] = 1;
                batch.seq_id[i][0] = dest_seq_id;
            }
            if (!llama_kv_cache_find_slot(kv_self, batch)) {
                llama_batch_free(batch);
                LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
                return false;
            }

            // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
            // Assume that this is one contiguous block of cells
            GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
            GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
            GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));

            // Cleanup
            llama_batch_free(batch);
        } else {
            // whole KV cache restore

            if (cell_count > kv_self.size) {
                LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
                return false;
            }

            llama_kv_cache_clear(kv_self);

            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_kv_cell & cell = kv_self.cells[i];

                llama_pos pos;
                uint32_t  n_seq_id;

                read_to(&pos,      sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));

                cell.pos = pos;

                for (uint32_t j = 0; j < n_seq_id; ++j) {
                    llama_seq_id seq_id;
                    read_to(&seq_id, sizeof(seq_id));

                    if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
                        LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
                        return false;
                    }

                    cell.seq_id.insert(seq_id);
                }
            }

            kv_self.head = 0;
            kv_self.used = cell_count;
        }

        return true;
    }

    void read_kv_cache_data_split(llama_context * ctx, ggml_tensor * tensor, const uint8_t * data, size_t head, size_t row_size, int nrows, int il) {
        GGML_ASSERT(il >= 0 && il < int(ctx->model.layers.size()));
        GGML_ASSERT(ggml_internal_get_type_traits(tensor->type).row_meta_size == 0);
        std::vector<uint8_t> aux;
        auto extra = (ggml_split_tensor_t *)tensor->extra;
        if (ctx->model.is_mla_model()) {
            GGML_ASSERT(extra);
            GGML_ASSERT(ctx->cparams.mla_attn == 3);
            for (int id = 0; id < extra->n_device; ++id) {
                auto split = extra->splits[id];
                if (!split) continue;
                GGML_ASSERT(split->type == tensor->type);
                ggml_backend_tensor_set(split, data, head*row_size, nrows*row_size);
            }
            return;
        }
        bool is_recurrent =  ctx->model.hparams.recurrent_layer_arr[il];
        auto kv = is_recurrent ? nullptr : get_kv_cache_split_tensor(tensor, ctx->model.layers[il]);
        auto kv_extra = kv ? (ggml_split_tensor_t *)kv->extra : nullptr;
        GGML_ASSERT(extra && (is_recurrent || kv_extra));
        auto ne = kv ? kv->ne[1] : tensor->ne[0];
        size_t sum_ne = 0;
        size_t sum_split_row_size = 0;
        GGML_ASSERT(row_size == ggml_row_size(tensor->type, ne));
        for (int id = 0; id < extra->n_device; ++id) {
            auto split = extra->splits[id];
            auto kv_split = kv_extra ? kv_extra->splits[id] : nullptr;
            GGML_ASSERT((split && (kv_split || is_recurrent)) || (!split && !kv_split));
            if (!split) continue;
            GGML_ASSERT(split->type == tensor->type);
            auto ne_split = kv_split ? kv_split->ne[1] : split->ne[0];
            auto split_row_size = ggml_row_size(tensor->type, ne_split);
            aux.resize(split_row_size*nrows);
            auto src = data + sum_split_row_size;
            auto dst = aux.data();
            for (int row = 0; row < nrows; ++row) {
                std::memcpy(dst, src, split_row_size);
                dst += split_row_size;
                src += row_size;
            }
            ggml_backend_tensor_set(split, aux.data(), head*split_row_size, nrows*split_row_size);
            sum_ne += ne_split;
            sum_split_row_size += split_row_size;
        }
        GGML_ASSERT(sum_ne == ne);
        GGML_ASSERT(sum_split_row_size == row_size);
    }

    bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) {
        const struct llama_hparams & hparams = ctx->model.hparams;
        struct llama_kv_cache & kv_self = ctx->kv_self;
        bool need_kv = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0;
        // v_state: 0 -> not transposed V cache
        //          1 -> transposed V cache
        //          2 -> no V cache (as it may be the case with MLA)
        uint32_t v_state;
        uint32_t n_layer;
        read_to(&v_state, sizeof(v_state));
        read_to(&n_layer, sizeof(n_layer));

        if (n_layer != kv_self.k_l.size()) {
            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
            return false;
        }
        if (cell_count > kv_self.size) {
            LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
            return false;
        }

	// Currently the only way there is no V cache (and thus v_state is 2) requires flash_attn, and flash_attn sets kv_self.v_trans to false
        if (kv_self.v_trans != (v_state == 1)) {
            LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
            return false;
        }

        // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
            const uint32_t kv_lora_rank = hparams.n_lora_kv;
            const bool has_k_cache = kv_self.k_l[il] != nullptr && need_kv;


            // Read type of key
            int32_t k_type_i_ref;
            read_to(&k_type_i_ref, sizeof(k_type_i_ref));
            if (!has_k_cache) {
                if (k_type_i_ref != -1) {
                    LLAMA_LOG_ERROR("%s: missing key cache for layer %d\n", __func__, il);
                    return false;
                }
            } else {
                const int32_t k_type_i = (int32_t) kv_self.k_l[il]->type;
                if (k_type_i != k_type_i_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
                    return false;
                }
            }

            // Read row size of key
            uint64_t k_size_row_ref;
            read_to(&k_size_row_ref, sizeof(k_size_row_ref));
            if (!has_k_cache) {
                if (k_size_row_ref != 0) {
                    LLAMA_LOG_ERROR("%s: expected empty key row size for layer %d\n", __func__, il);
                    return false;
                }
                continue;
            }
            const size_t k_size_row = (ctx->cparams.mla_attn == 0) ? ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa) : ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope);
            if (k_size_row != k_size_row_ref) {
                LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
                return false;
            }

            if (cell_count) {
                // Read and set the keys for the whole cell range
                if (kv_self.k_l[il]->extra) {
                    read_kv_cache_data_split(ctx, kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head, k_size_row, cell_count, il);
                } else {
                    ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
                }
            }
        }

        if (v_state == 0) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;

                // Read type of value
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                if (!has_v_cache) {
                    if (v_type_i_ref != -1) {
                        LLAMA_LOG_ERROR("%s: missing value cache for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const int32_t v_type_i = (int32_t) kv_self.v_l[il]->type;
                    if (v_type_i != v_type_i_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                        return false;
                    }
                }

                // Read row size of value
                uint64_t v_size_row_ref;
                read_to(&v_size_row_ref, sizeof(v_size_row_ref));
                if (!has_v_cache) {
                    if (v_size_row_ref != 0) {
                        LLAMA_LOG_ERROR("%s: expected empty value row size for layer %d\n", __func__, il);
                        return false;
                    }
                    continue;
                }
                const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
                if (v_size_row != v_size_row_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
                    return false;
                }

                if (cell_count) {
                    // Read and set the values for the whole cell range
                    if (kv_self.v_l[il]->extra) {
                        read_kv_cache_data_split(ctx, kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head, v_size_row, cell_count, il);
                    } else {
                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
                    }
                }
            }
        }
        else if (v_state == 1) {
            // For each layer, read the values for each cell (transposed)
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;

                // Read type of value
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                if (!has_v_cache) {
                    if (v_type_i_ref != -1) {
                        LLAMA_LOG_ERROR("%s: missing transposed value cache for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const int32_t v_type_i = (int32_t) kv_self.v_l[il]->type;
                    if (v_type_i != v_type_i_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                        return false;
                    }
                }

                // Read element size of value
                uint32_t v_size_el_ref;
                read_to(&v_size_el_ref, sizeof(v_size_el_ref));
                if (!has_v_cache) {
                    if (v_size_el_ref != 0) {
                        LLAMA_LOG_ERROR("%s: expected empty transposed value element size for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
                    if (v_size_el != v_size_el_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
                        return false;
                    }
                }

                // Read GQA embedding size
                uint32_t n_embd_v_gqa_ref;
                read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
                if (!has_v_cache) {
                    if (n_embd_v_gqa_ref != 0) {
                        LLAMA_LOG_ERROR("%s: expected empty transposed value rows for layer %d\n", __func__, il);
                        return false;
                    }
                    continue;
                }
                if (n_embd_v_gqa != n_embd_v_gqa_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
                    return false;
                }

                if (cell_count) {
                    const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
                    if (kv_self.v_l[il]->extra) {
                        throw std::runtime_error("Transposed V cache is not sypported with split mode 'graph'");
                    }
                    // For each row in the transposed matrix, read the values for the whole cell range
                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                        const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                    }
                }
            }
        }

        uint32_t qnext_state_ref = 0;
        read_to(&qnext_state_ref, sizeof(qnext_state_ref));

        const bool has_qnext_state = llama_kv_has_qnext_state_storage(kv_self);
        if ((qnext_state_ref != 0) != has_qnext_state) {
            LLAMA_LOG_ERROR("%s: incompatible qwen3next state cache presence\n", __func__);
            return false;
        }

        if (qnext_state_ref != 0) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr;

                int32_t s_type_i_ref;
                read_to(&s_type_i_ref, sizeof(s_type_i_ref));
                if (!has_s_cache) {
                    if (s_type_i_ref != -1) {
                        LLAMA_LOG_ERROR("%s: missing qwen3next state cache for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const int32_t s_type_i = (int32_t) kv_self.s_l[il]->type;
                    if (s_type_i != s_type_i_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched qwen3next state type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
                        return false;
                    }
                }

                uint64_t s_size_row_ref;
                read_to(&s_size_row_ref, sizeof(s_size_row_ref));

                const uint64_t s_size_row = has_s_cache ? ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]) : 0;
                if (s_size_row != s_size_row_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched qwen3next state row size (%zu != %zu, layer %d)\n",
                            __func__, (size_t) s_size_row, (size_t) s_size_row_ref, il);
                    return false;
                }

                uint32_t s_rows_ref;
                read_to(&s_rows_ref, sizeof(s_rows_ref));

                uint32_t s_rows = 0;
                uint32_t s_dst_row = 0;
                if (has_s_cache) {
                    const uint32_t n_slots = (uint32_t) kv_self.s_l[il]->ne[1];
                    if (seq_id == -1) {
                        s_rows = n_slots;
                    } else if (llama_kv_qnext_seq_id_in_range(kv_self, seq_id)) {
                        s_rows = 1;
                        s_dst_row = (uint32_t) seq_id;
                    }
                }

                if (s_rows_ref != s_rows) {
                    LLAMA_LOG_ERROR("%s: mismatched qwen3next state row count (%u != %u, layer %d)\n", __func__, s_rows, s_rows_ref, il);
                    return false;
                }

                if (s_rows > 0) {
                    const size_t s_data_size = s_rows * s_size_row;
                    const size_t s_dst_offset = (size_t) s_dst_row * s_size_row;
                    if (kv_self.s_l[il]->extra) {
                        read_kv_cache_data_split(ctx, kv_self.s_l[il], read(s_data_size), s_dst_row, s_size_row, s_rows, il);
                    } else {
                        ggml_backend_tensor_set(kv_self.s_l[il], read(s_data_size), s_dst_offset, s_data_size);
                    }
                }
            }
        }
        return true;
    }

    void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) {
        uint32_t cell_count;
        read_to(&cell_count, sizeof(cell_count));

        bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count, seq_id, flags);

        if (!res) {
            if (seq_id == -1) {
                llama_kv_cache_clear(ctx);
            } else {
                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
            }
            throw std::runtime_error("failed to restore kv cache");
        }
    }
};

struct llama_data_write_dummy : llama_data_write {
    size_t size_written = 0;

    llama_data_write_dummy() {}

    void write(const void * /* src */, size_t size) override {
        size_written += size;
    }

    void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size, int /* il */) override {
        size_written += size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};

struct llama_data_write_buffer : llama_data_write {
    uint8_t * ptr;
    size_t buf_size = 0;
    size_t size_written = 0;

    const llama_model & model;

    std::vector<uint8_t> aux_buffer;

    llama_data_write_buffer(uint8_t * p, size_t len, const llama_model & _model) : ptr(p), buf_size(len), model(_model) {}

    void write(const void * src, size_t size) override {
        if (size > buf_size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        memcpy(ptr, src, size);
        ptr += size;
        size_written += size;
        buf_size -= size;
    }

    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) override {
        if (size > buf_size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        if (tensor->extra) {
            get_tensor_data_split(tensor, offset, size, il);
        } else {
            ggml_backend_tensor_get(tensor, ptr, offset, size);
        }
        ptr += size;
        size_written += size;
        buf_size -= size;
    }

    void get_tensor_data_split(const ggml_tensor * tensor, size_t offset, size_t size, int il) {
        auto tt = ggml_internal_get_type_traits(tensor->type);
        if (tt.row_meta_size > 0) {
            throw std::runtime_error(std::string{"Split cache for type "} + ggml_type_name(tensor->type) + " is not supported");
        }
        GGML_ASSERT(il >= 0 && il < int(model.layers.size()));
        if (model.is_mla_model()) {
            // For MLA models, the cache is replacated on all GPUs when using split mode graph, so it is
            // enough to get the data from the 1st device that has a copy
            auto extra = (const ggml_split_tensor_t *)tensor->extra;
            for (int id = 0; id < extra->n_device; ++id) {
                if (extra->splits[id]) {
                    ggml_backend_tensor_get(extra->splits[id], ptr, offset, size);
                    break;
                }
            }
        }
        else if (model.hparams.recurrent_layer_arr[il]) {
            get_tensor_data_split(ptr, tensor, aux_buffer, offset, size);
        } else {
            auto kv = get_kv_cache_split_tensor(tensor, model.layers[il]);
            get_tensor_data_split(ptr, tensor, kv, aux_buffer, offset, size);
        }
    }

    static void get_tensor_data_split(uint8_t * ptr, const ggml_tensor * tensor, const ggml_tensor * kv,
            std::vector<uint8_t> & aux_buffer, size_t offset, size_t size) {
        auto ne = kv->ne[1];
        auto full_row_size = ggml_row_size(tensor->type, ne);
        GGML_ASSERT(offset % full_row_size == 0);
        GGML_ASSERT(size   % full_row_size == 0);
        auto first_row = offset / full_row_size;
        auto num_rows  = size / full_row_size;
        auto extra = (const ggml_split_tensor_t *)tensor->extra;
        auto kv_extra = (const ggml_split_tensor_t *)kv->extra;
        GGML_ASSERT(extra && kv_extra);
        size_t split_offset = 0;
        size_t total_size   = 0;
        for (int id = 0; id < extra->n_device; ++id) {
            auto split = extra->splits[id];
            auto kv_split = kv_extra->splits[id];
            GGML_ASSERT((split && kv_split) || (!split && !kv_split));
            if (!split) continue;
            GGML_ASSERT(split->type == tensor->type);
            auto split_row_size = ggml_row_size(tensor->type, kv_split->ne[1]);
            auto split_size = split_row_size * num_rows;
            if (split_size > aux_buffer.size()) aux_buffer.resize(split_size);
            ggml_backend_tensor_get(split, aux_buffer.data(), first_row*split_row_size, split_size);
            auto dst = ptr + split_offset;
            auto src = aux_buffer.data();
            for (int row = 0; row < (int)num_rows; ++row) {
                std::memcpy(dst, src, split_row_size);
                dst += full_row_size;
                src += split_row_size;
            }
            split_offset += split_row_size;
            total_size   += split_row_size * num_rows;
        }
        GGML_ASSERT(total_size == size);
    }
    static void get_tensor_data_split(uint8_t * ptr, const ggml_tensor * tensor,
            std::vector<uint8_t> & aux_buffer, size_t offset, size_t size) {
        auto ne = tensor->ne[0];
        auto full_row_size = ggml_row_size(tensor->type, ne);
        GGML_ASSERT(offset % full_row_size == 0);
        GGML_ASSERT(size   % full_row_size == 0);
        auto first_row = offset / full_row_size;
        auto num_rows  = size / full_row_size;
        auto extra = (const ggml_split_tensor_t *)tensor->extra;
        GGML_ASSERT(extra);
        size_t split_offset = 0;
        size_t total_size   = 0;
        for (int id = 0; id < extra->n_device; ++id) {
            auto split = extra->splits[id];
            if (!split) continue;
            GGML_ASSERT(split->type == tensor->type);
            auto split_row_size = ggml_row_size(tensor->type, split->ne[0]);
            auto split_size = split_row_size * num_rows;
            if (split_size > aux_buffer.size()) aux_buffer.resize(split_size);
            ggml_backend_tensor_get(split, aux_buffer.data(), first_row*split_row_size, split_size);
            auto dst = ptr + split_offset;
            auto src = aux_buffer.data();
            for (int row = 0; row < (int)num_rows; ++row) {
                std::memcpy(dst, src, split_row_size);
                dst += full_row_size;
                src += split_row_size;
            }
            split_offset += split_row_size;
            total_size   += split_row_size * num_rows;
        }
        GGML_ASSERT(total_size == size);
    }

    size_t get_size_written() override {
        return size_written;
    }
};

struct llama_data_read_buffer : llama_data_read {
    const uint8_t * ptr;
    size_t buf_size = 0;
    size_t size_read = 0;

    llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}

    const uint8_t * read(size_t size) override {
        const uint8_t * base_ptr = ptr;
        if (size > buf_size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        ptr += size;
        size_read += size;
        buf_size -= size;
        return base_ptr;
    }

    void read_to(void * dst, size_t size) override {
        memcpy(dst, read(size), size);
    }

    size_t get_size_read() override {
        return size_read;
    }
};

struct llama_data_write_file : llama_data_write {
    llama_file * file;
    size_t size_written = 0;
    std::vector<uint8_t> temp_buffer;
    std::vector<uint8_t> aux_buffer;

    const llama_model & model;

    llama_data_write_file(llama_file * f, const llama_model & _model) : file(f), model(_model) {}

    void write(const void * src, size_t size) override {
        file->write_raw(src, size);
        size_written += size;
    }

    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) override {
        temp_buffer.resize(size);
        if (tensor->extra) {
            get_tensor_data_split(tensor, offset, size, il);
        } else {
            ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
        }
        write(temp_buffer.data(), temp_buffer.size());
    }

    void get_tensor_data_split(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) {
        GGML_ASSERT(il >= 0 && il < int(model.layers.size()));
        if (model.is_mla_model()) {
            // MLA models have the cache replicated on all devices. Hence, it is enough to get it
            // from the 1st device that has it.
            auto extra = (ggml_split_tensor_t *)tensor->extra;
            for (int id = 0; id < extra->n_device; ++id) {
                if (extra->splits[id]) {
                    ggml_backend_tensor_get(extra->splits[id], temp_buffer.data(), offset, size);
                    break;
                }
            }
            return;
        }
        auto kv = get_kv_cache_split_tensor(tensor, model.layers[il]);
        temp_buffer.resize(size);
        llama_data_write_buffer::get_tensor_data_split(temp_buffer.data(), tensor, kv, aux_buffer, offset, size);
    }

    size_t get_size_written() override {
        return size_written;
    }
};

struct llama_data_read_file : llama_data_read {
    llama_file * file;
    size_t size_read = 0;
    std::vector<uint8_t> temp_buffer;

    llama_data_read_file(llama_file * f) : file(f) {}

    void read_to(void * dst, size_t size) override {
        file->read_raw(dst, size);
        size_read += size;
    }

    const uint8_t * read(size_t size) override {
        temp_buffer.resize(size);
        read_to(temp_buffer.data(), size);
        return temp_buffer.data();
    }

    size_t get_size_read() override {
        return size_read;
    }
};

/** copy state data into either a buffer or file depending on the passed in context
 *
 * file context:
 * llama_file file("/path", "wb");
 * llama_data_write_file data_ctx(&file);
 * llama_state_get_data_internal(ctx, data_ctx);
 *
 * buffer context:
 * std::vector<uint8_t> buf(max_size, 0);
 * llama_data_write_buffer data_ctx(buf.data(), max_size);
 * llama_state_get_data_internal(ctx, data_ctx);
 *
*/
static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
    llama_synchronize(ctx);

    data_ctx.write_model_info(ctx);

    data_ctx.write_rng(ctx->sampling.rng);

    // copy outputs
    data_ctx.write_output_ids(ctx);
    data_ctx.write_logits(ctx);
    data_ctx.write_embeddings(ctx);

    data_ctx.write_kv_cache(ctx);

    return data_ctx.get_size_written();
}

size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
    llama_data_write_buffer data_ctx(dst, size, ctx->model);
    try {
        return llama_state_get_data_internal(ctx, data_ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
        return 0;
    }
}

// Returns the *actual* size of the state.
// Intended to be used when saving to state to a buffer.
size_t llama_state_get_size(struct llama_context * ctx) {
    llama_data_write_dummy data_ctx;
    try {
        return llama_state_get_data_internal(ctx, data_ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
        return 0;
    }
}

static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
    llama_synchronize(ctx);

    data_ctx.read_model_info(ctx);

    // set rng
    data_ctx.read_rng(ctx->sampling.rng);

    // set outputs
    data_ctx.read_output_ids(ctx);
    data_ctx.read_logits(ctx);
    data_ctx.read_embeddings(ctx);

    data_ctx.read_kv_cache(ctx);

    return data_ctx.get_size_read();
}

// Sets the state reading from the specified source address
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
    llama_data_read_buffer data_ctx(src, size);
    try {
        return llama_state_set_data_internal(ctx, data_ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
        return 0;
    }
}

static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(path_session, "rb");

    // sanity checks
    {
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
            LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
            return false;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return false;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        const size_t n_state_size_cur = file.size() - file.tell();

        llama_data_read_file data_ctx(&file);
        const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);

        if (n_read != n_state_size_cur) {
            LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
            return false;
        }
    }
    return true;
}

bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    try {
        return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
        return false;
    }
}

static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    llama_file file(path_session, "wb");

    file.write_u32(LLAMA_SESSION_MAGIC);
    file.write_u32(LLAMA_SESSION_VERSION);

    // save the prompt
    file.write_u32((uint32_t) n_token_count);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // save the context state using stream saving
    llama_data_write_file data_ctx(&file, ctx->model);
    llama_state_get_data_internal(ctx, data_ctx);

    return true;
}

bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    try {
        return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
        return false;
    }
}

static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
    llama_synchronize(ctx);

    data_ctx.write_kv_cache(ctx, seq_id, flags);

    return data_ctx.get_size_written();
}

size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
    llama_data_write_dummy data_ctx;
    return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id, flags);
}

size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
    llama_data_write_buffer data_ctx(dst, size, ctx->model);
    try {
        return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
        return 0;
    }
}

static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id, llama_state_seq_flags flags) {
    llama_synchronize(ctx);

    data_ctx.read_kv_cache(ctx, dest_seq_id, flags);

    return data_ctx.get_size_read();
}

size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id, llama_state_seq_flags flags) {
    llama_data_read_buffer data_ctx(src, size);
    try {
        return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
        return 0;
    }
}

static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    llama_file file(filepath, "wb");

    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
    file.write_u32(LLAMA_STATE_SEQ_VERSION);

    // save the prompt
    file.write_u32((uint32_t) n_token_count);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // save the context state using stream saving
    llama_data_write_file data_ctx(&file, ctx->model);
    llama_state_seq_get_data_internal(ctx, data_ctx, seq_id, 0);

    const size_t res = file.tell();
    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
    return res;
}

static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(filepath, "rb");

    // version checks
    {
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
            LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
            return 0;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return 0;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        const size_t state_size = file.size() - file.tell();
        llama_data_read_file data_ctx(&file);
        const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id, 0);
        if (!nread) {
            LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
            return 0;
        }
        GGML_ASSERT(nread <= state_size);
        GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
    }

    return file.tell();
}

size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    try {
        return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    try {
        return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
        return 0;
    }
}

void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
    ctx->cparams.n_threads       = n_threads;
    ctx->cparams.n_threads_batch = n_threads_batch;
}

uint32_t llama_n_threads(struct llama_context * ctx) {
    return ctx->cparams.n_threads;
}

uint32_t llama_n_threads_batch(struct llama_context * ctx) {
    return ctx->cparams.n_threads_batch;
}

void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
    ctx->abort_callback      = abort_callback;
    ctx->abort_callback_data = abort_callback_data;
}

void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
    ctx->cparams.embeddings = embeddings;
}

void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
}

struct llama_batch llama_batch_get_one(
             llama_token * tokens,
                 int32_t   n_tokens,
               llama_pos   pos_0,
            llama_seq_id   seq_id) {
    return {
        /*n_tokens       =*/ n_tokens,
        /*tokens         =*/ tokens,
        /*embd           =*/ nullptr,
        /*pos            =*/ nullptr,
        /*n_seq_id       =*/ nullptr,
        /*seq_id         =*/ nullptr,
        /*logits         =*/ nullptr,
        /*all_pos_0      =*/ pos_0,
        /*all_pos_1      =*/ 1,
        /*all_seq_id     =*/ seq_id,
    };
}

struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

    if (embd) {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
    } else {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    }

    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens_alloc);
    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens_alloc);
    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
    for (int i = 0; i < n_tokens_alloc; ++i) {
        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
    }
    batch.seq_id[n_tokens_alloc] = nullptr;

    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens_alloc);

    return batch;
}

void llama_batch_free(struct llama_batch batch) {
    if (batch.token)    free(batch.token);
    if (batch.embd)     free(batch.embd);
    if (batch.pos)      free(batch.pos);
    if (batch.n_seq_id) free(batch.n_seq_id);
    if (batch.seq_id) {
        for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
            free(batch.seq_id[i]);
        }
        free(batch.seq_id);
    }
    if (batch.logits)   free(batch.logits);
}

int32_t llama_encode(
        struct llama_context * ctx,
          struct llama_batch   batch) {
    const int ret = llama_encode_internal(*ctx, batch);
    if (ret < 0) {
        LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
    }

    return ret;
}

int32_t llama_decode(
        struct llama_context * ctx,
          struct llama_batch   batch) {
    const int ret = llama_decode_internal(*ctx, batch);
    if (ret < 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

    return ret;
}

void llama_set_mtp_op_type(llama_context * ctx, llama_mtp_op_type mtp_op_type) {
    ctx->set_mtp_op_type(mtp_op_type);
}

void llama_synchronize(struct llama_context * ctx) {
    ggml_backend_sched_synchronize(ctx->sched);

    // FIXME: if multiple single tokens are evaluated without a synchronization,
    // the stats will be added to the prompt evaluation stats
    // this should only happen when using batch size 1 to evaluate a batch

    // add the evaluation to the stats
    if (ctx->n_queued_tokens == 1) {
        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        ctx->n_eval++;
    } else if (ctx->n_queued_tokens > 1) {
        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        ctx->n_p_eval += ctx->n_queued_tokens;
    }

    // get a more accurate load time, upon first eval
    if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }

    ctx->n_queued_tokens = 0;
    ctx->t_compute_start_us = 0;
}

float * llama_get_logits(struct llama_context * ctx) {
    llama_synchronize(ctx);

    return ctx->logits;
}

float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
    int32_t j = -1;
    llama_synchronize(ctx);

    try {
        if (ctx->logits == nullptr) {
            throw std::runtime_error("no logits");
        }

        if (i < 0) {
            j = ctx->n_outputs + i;
            if (j < 0) {
                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
            }
        } else if ((size_t) i >= ctx->output_ids.size()) {
            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
        } else {
            j = ctx->output_ids[i];
        }

        if (j < 0) {
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
        if (j >= ctx->n_outputs) {
            // This should not happen
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
        }

        return ctx->logits + j*ctx->model.hparams.n_vocab;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#endif
        return nullptr;
    }
}

float * llama_get_embeddings(struct llama_context * ctx) {
    llama_synchronize(ctx);

    return ctx->embd;
}

float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
    int32_t j = -1;

    llama_synchronize(ctx);

    try {
        if (ctx->embd == nullptr) {
            throw std::runtime_error("no embeddings");
        }

        if (i < 0) {
            j = ctx->n_outputs_embd + i;
            if (j < 0) {
                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs_embd));
            }
        } else if ((size_t) i >= ctx->output_ids.size()) {
            throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size()));
        } else {
            j = ctx->output_ids[i];
        }

        if (j < 0) {
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
        if (j >= ctx->n_outputs_embd) {
            // This should not happen
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs_embd=%d)", j, ctx->n_outputs_embd));
        }

        return ctx->embd + (size_t) j * llama_output_embd_width(*ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#endif
        return nullptr;
    }
}

float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
    llama_synchronize(ctx);

    auto it = ctx->embd_seq.find(seq_id);
    if (it == ctx->embd_seq.end()) {
        return nullptr;
    }

    return it->second.data();
}

//
// vocab
//

const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    return model->vocab.token_get_text(token);
}

float llama_token_get_score(const struct llama_model * model, llama_token token) {
    return model->vocab.token_get_score(token);
}

enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
    return model->vocab.token_get_attr(token);
}

bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
    return model->vocab.is_eog(token);
}

bool llama_token_is_control(const struct llama_model * model, llama_token token) {
    return model->vocab.is_control(token);
}

llama_token llama_token_bos(const struct llama_model * model) {
    return model->vocab.token_bos();
}

llama_token llama_token_eos(const struct llama_model * model) {
    return model->vocab.token_eos();
}

llama_token llama_token_sep(const struct llama_model * model) {
    return model->vocab.token_sep();
}

llama_token llama_token_nl (const struct llama_model * model) {
    return model->vocab.token_nl();
}

llama_token llama_token_pad(const struct llama_model * model) {
    return model->vocab.token_pad();
}

int32_t llama_add_bos_token(const struct llama_model * model) {
    return model->vocab.get_add_bos();
}

int32_t llama_add_eos_token(const struct llama_model * model) {
    return model->vocab.get_add_eos();
}

llama_token llama_token_prefix(const struct llama_model * model) {
    return model->vocab.token_prefix();
}

llama_token llama_token_middle(const struct llama_model * model) {
    return model->vocab.token_middle();
}

llama_token llama_token_suffix(const struct llama_model * model) {
    return model->vocab.token_suffix();
}

llama_token llama_token_eot(const struct llama_model * model) {
    return model->vocab.token_eot();
}

//
// tokenization
//

int32_t llama_tokenize(
    const struct llama_model * model,
                  const char * text,
                     int32_t   text_len,
                 llama_token * tokens,
                     int32_t   n_tokens_max,
                        bool   add_special,
                        bool   parse_special) {
    return model->vocab.tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}

int32_t llama_token_to_piece(
    const struct llama_model * model,
                 llama_token   token,
                        char * buf,
                     int32_t   length,
                     int32_t   lstrip,
                        bool   special) {
    return model->vocab.token_to_piece(token, buf, length, lstrip, special);
}


int32_t llama_detokenize(
    const struct llama_vocab * vocab,
    const llama_token * tokens,
    int32_t   n_tokens,
    char * text,
    int32_t   text_len_max,
    bool   remove_special,
    bool   unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}

//
// chat templates
//

static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
    if (auto it = LLM_CHAT_TEMPLATES.find(tmpl); it != LLM_CHAT_TEMPLATES.end()) {
        return it->second;
    }
    auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
        return tmpl.find(haystack) != std::string::npos;
    };
    if (tmpl_contains("<|im_start|>")) {
        return LLM_CHAT_TEMPLATE_CHATML;
    } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
        if (tmpl_contains("[SYSTEM_PROMPT]")) {
            return LLM_CHAT_TEMPLATE_MISTRAL_V7;
        } else if (
            // catches official 'v1' template
            tmpl_contains("' [INST] ' + system_message")
            // catches official 'v3' and 'v3-tekken' templates
            || tmpl_contains("[AVAILABLE_TOOLS]")
        ) {
            // Official mistral 'v1', 'v3' and 'v3-tekken' templates
            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
            // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
            if (tmpl_contains(" [INST]")) {
                return LLM_CHAT_TEMPLATE_MISTRAL_V1;
            } else if (tmpl_contains("\"[INST]\"")) {
                return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
            }
            return LLM_CHAT_TEMPLATE_MISTRAL_V3;
        } else {
            // llama2 template and its variants
            // [variant] support system message
            // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
            bool support_system_message = tmpl_contains("<<SYS>>");
            bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
            bool strip_message = tmpl_contains("content.strip()");
            if (strip_message) {
                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
            } else if (add_bos_inside_history) {
                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
            } else if (support_system_message) {
                return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
            } else {
                return LLM_CHAT_TEMPLATE_LLAMA_2;
            }
        }
    } else if (tmpl_contains("[gMASK]sop")) {
        // chatglm3-6b
        return LLM_CHAT_TEMPLATE_CHATGLM_3;
    } else if (tmpl_contains("[gMASK]<sop>")) {
        return LLM_CHAT_TEMPLATE_CHATGLM_4;
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
        return LLM_CHAT_TEMPLATE_PHI_3;
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
        return LLM_CHAT_TEMPLATE_FALCON_3;
    } else if (tmpl == "falcon_e" && (tmpl_contains("assistant") && tmpl_contains("user"))) {
        return LLM_CHAT_TEMPLATE_FALCON_E;
    } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
        return LLM_CHAT_TEMPLATE_ZEPHYR;
    } else if (tmpl_contains("bos_token + message['role']")) {
        return LLM_CHAT_TEMPLATE_MONARCH;
    } else if (tmpl_contains("<start_of_turn>")) {
        return LLM_CHAT_TEMPLATE_GEMMA;
    } else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
        // OrionStarAI/Orion-14B-Chat
        return LLM_CHAT_TEMPLATE_ORION;
    } else if (tmpl_contains("GPT4 Correct ")) {
        // openchat/openchat-3.5-0106
        return LLM_CHAT_TEMPLATE_OPENCHAT;
    } else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
        // eachadea/vicuna-13b-1.1 (and Orca variant)
        if (tmpl_contains("SYSTEM: ")) {
            return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
        }
        return LLM_CHAT_TEMPLATE_VICUNA;
    } else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
        // deepseek-ai/deepseek-coder-33b-instruct
        return LLM_CHAT_TEMPLATE_DEEPSEEK;
    } else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
        // CohereForAI/c4ai-command-r-plus
        return LLM_CHAT_TEMPLATE_COMMAND_R;
    } else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA_3;
    } else if (tmpl_contains(LU8("<用户>"))) {
        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
        return LLM_CHAT_TEMPLATE_MINICPM;
    } else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
    } else if (tmpl_contains(LU8("<｜Assistant｜>")) && tmpl_contains(LU8("<｜User｜>")) && tmpl_contains(LU8("<｜end▁of▁sentence｜>"))) {
         // original: if (tmpl_contains(LU8("'<｜Assistant｜>' + message['content'] + '<｜end▁of▁sentence｜>'"))) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
    } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
        // EXAONE-3.0-7.8B-Instruct
        return LLM_CHAT_TEMPLATE_EXAONE_3;
    } else if (tmpl_contains("rwkv-world")) {
        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
    } else if (tmpl_contains("<|start_of_role|>")) {
        return LLM_CHAT_TEMPLATE_GRANITE;
    } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
        return LLM_CHAT_TEMPLATE_GIGACHAT;
    } else if (tmpl_contains("<|role_start|>")) {
        return LLM_CHAT_TEMPLATE_MEGREZ;
    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
        return LLM_CHAT_TEMPLATE_BAILING;
    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
        return LLM_CHAT_TEMPLATE_BAILING_THINK;
    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
        return LLM_CHAT_TEMPLATE_BAILING2;
    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA4;
    } else if (tmpl_contains("<|endofuserprompt|>")) {
        return LLM_CHAT_TEMPLATE_DOTS1;
    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
    } else if (tmpl_contains("<|im_middle|>") && tmpl_contains("<|im_end|>")) {
        return LLM_CHAT_TEMPLATE_KIMI_K2;
    } else if (tmpl_contains("'Assistant: '  + message['content'] + '<|separator|>")) {
        return LLM_CHAT_TEMPLATE_GROK_2;
    } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
    } else if (tmpl_contains("<seed:bos>")) {
        return LLM_CHAT_TEMPLATE_SEED_OSS;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
}

static int32_t llama_chat_apply_template_internal(
    const llm_chat_template tmpl,
    const std::vector<const llama_chat_message *> & chat,
    std::string & dest, bool add_ass) {
    // Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
    std::stringstream ss;
    if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
        // chatml template
        for (auto message : chat) {
            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
        }
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
        // Official mistral 'v7' template
        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
        for (auto message : chat) {
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
            } else if (role == "user") {
                ss << "[INST] " << content << "[/INST]";
            }
            else {
                ss << " " << content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
        // See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
        std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
        std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
        bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
        bool is_inside_turn = false;
        for (auto message : chat) {
            if (!is_inside_turn) {
                ss << leading_space << "[INST]" << trailing_space;
                is_inside_turn = true;
            }
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
                ss << content << "\n\n";
            } else if (role == "user") {
                ss << content << leading_space << "[/INST]";
            } else {
                ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
                is_inside_turn = false;
            }
        }
    } else if (
            tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
        // llama2 template and its variants
        // [variant] support system message
        // See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
        bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
        // [variant] add BOS inside history
        bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
        // [variant] trim spaces from the input message
        bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
        // construct the prompt
        bool is_inside_turn = true; // skip BOS at the beginning
        ss << "[INST] ";
        for (auto message : chat) {
            std::string content = strip_message ? trim(message->content) : message->content;
            std::string role(message->role);
            if (!is_inside_turn) {
                is_inside_turn = true;
                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
            }
            if (role == "system") {
                if (support_system_message) {
                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
                } else {
                    // if the model does not support system message, we still include it in the first message, but without <<SYS>>
                    ss << content << "\n";
                }
            } else if (role == "user") {
                ss << content << " [/INST]";
            } else {
                ss << content << "</s>";
                is_inside_turn = false;
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
        // Phi 3
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
        // Falcon 3
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>\n" << message->content << "\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_E) {
        // Falcon Edge
        for (auto message : chat) {
            std::string role(message->role);
            ss << role << message->content << "\n";
        }
        if (add_ass) {
            ss << "assistant\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
        // zephyr template
        for (auto message : chat) {
            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
        // mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
        for (auto message : chat) {
            std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
            ss << bos << message->role << "\n" << message->content << "</s>\n";
        }
        if (add_ass) {
            ss << "<s>assistant\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
        // google/gemma-7b-it
        std::string system_prompt = "";
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
                system_prompt = trim(message->content);
                continue;
            }
            // in gemma, "assistant" is "model"
            role = role == "assistant" ? "model" : message->role;
            ss << "<start_of_turn>" << role << "\n";
            if (!system_prompt.empty() && role != "model") {
                ss << system_prompt << "\n\n";
                system_prompt = "";
            }
            ss << trim(message->content) << "<end_of_turn>\n";
        }
        if (add_ass) {
            ss << "<start_of_turn>model\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
        // OrionStarAI/Orion-14B-Chat
        std::string system_prompt = "";
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                // there is no system message support, we will merge it with user prompt
                system_prompt = message->content;
                continue;
            } else if (role == "user") {
                ss << "Human: ";
                if (!system_prompt.empty()) {
                    ss << system_prompt << "\n\n";
                    system_prompt = "";
                }
                ss << message->content << "\n\nAssistant: </s>";
            } else {
                ss << message->content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
        // openchat/openchat-3.5-0106,
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content << "<|end_of_turn|>";
            } else {
                role[0] = toupper(role[0]);
                ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
            }
        }
        if (add_ass) {
            ss << "GPT4 Correct Assistant:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
        // eachadea/vicuna-13b-1.1 (and Orca variant)
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                // Orca-Vicuna variant uses a system prefix
                if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
                    ss << "SYSTEM: " << message->content << "\n";
                } else {
                    ss << message->content << "\n\n";
                }
            } else if (role == "user") {
                ss << "USER: " << message->content << "\n";
            } else if (role == "assistant") {
                ss << "ASSISTANT: " << message->content << "</s>\n";
            }
        }
        if (add_ass) {
            ss << "ASSISTANT:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
        // deepseek-ai/deepseek-coder-33b-instruct
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content;
            } else if (role == "user") {
                ss << "### Instruction:\n" << message->content << "\n";
            } else if (role == "assistant") {
                ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
            }
        }
        if (add_ass) {
            ss << "### Response:\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
        // CohereForAI/c4ai-command-r-plus
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
            } else if (role == "user") {
                ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
            } else if (role == "assistant") {
                ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
            }
        }
        if (add_ass) {
            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
        // Llama 3
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
        }
        if (add_ass) {
            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
        // chatglm3-6b
        ss << "[gMASK]" << "sop";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n " << message->content;
        }
        if (add_ass) {
            ss << "<|assistant|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
        ss << "[gMASK]" << "<sop>";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n" << message->content;
        }
        if (add_ass) {
            ss << "<|assistant|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "user") {
                ss << LU8("<用户>");
                ss << trim(message->content);
                ss << "<AI>";
            } else {
                ss << trim(message->content);
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
        // DeepSeek-V2
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content << "\n\n";
            } else if (role == "user") {
                ss << "User: " << message->content << "\n\n";
            } else if (role == "assistant") {
                ss << "Assistant: " << message->content << LU8("<｜end▁of▁sentence｜>");
            }
        }
        if (add_ass) {
            ss << "Assistant:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
        // DeepSeek-V3
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content << "\n\n";
            } else if (role == "user") {
                ss << LU8("<｜User｜>") << message->content;
            } else if (role == "assistant") {
                ss << LU8("<｜Assistant｜>") << message->content << LU8("<｜end▁of▁sentence｜>");
            }
        }
        if (add_ass) {
            ss << LU8("<｜Assistant｜>");
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
        // EXAONE-3.0-7.8B-Instruct
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
            } else if (role == "user") {
                ss << "[|user|]" << trim(message->content) << "\n";
            } else if (role == "assistant") {
                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
            }
        }
        if (add_ass) {
            ss << "[|assistant|]";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
        // this template requires the model to have "\n\n" as EOT token
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "user") {
                ss << "User: " << message->content << "\n\nAssistant:";
            } else {
                ss << message->content << "\n\n";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
        // IBM Granite template
        for (const auto & message : chat) {
            std::string role(message->role);
            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
            if (role == "assistant_tool_call") {
                ss << "<|tool_call|>";
            }
            ss << message->content << "<|end_of_text|>\n";
        }
        if (add_ass) {
            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
        // GigaChat template
        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";

        // Handle system message if present
        if (has_system) {
            ss << "<s>" << chat[0]->content << "<|message_sep|>";
        } else {
            ss << "<s>";
        }

        // Process remaining messages
        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
            std::string role(chat[i]->role);
            if (role == "user") {
                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
                << "available functions<|role_sep|>[]<|message_sep|>";
            } else if (role == "assistant") {
                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
            }
        }

        // Add generation prompt if needed
        if (add_ass) {
            ss << "assistant<|role_sep|>";
        }
    }  else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
        // Megrez template
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
        }

        if (add_ass) {
            ss << "<|role_start|>assistant<|role_end|>";
        }
    }  else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
        // Bailing (Ling/Ring) template
        for (auto message : chat) {
            std::string role(message->role);

            if (role == "user") {
                role = "HUMAN";
            } else {
                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
            }

            ss << "<role>" << role << "</role>" << message->content;
        }

        if (add_ass) {
            ss << "<role>ASSISTANT</role>";

            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
                ss << "<think>";
            }
        }
    }  else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
        // Bailing2 (Ling 2.0) template
        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";

        if (!has_system) {
            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
        }

        for (auto message : chat) {
            std::string role(message->role);

            if (role == "user") {
                role = "HUMAN";
            } else {
                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
            }

            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
        }
        if (add_ass) {
            ss << "<role>ASSISTANT</role>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
        // Llama 4
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
        }
        if (add_ass) {
            ss << "<|header_start|>assistant<|header_end|>\n\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_BITNET) {
        // bitnet-25
        std::string system_prompt = "";
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "System: ";
                ss << message->content;
            } else if (role == "user") {
                ss << "User: ";
                if (!system_prompt.empty()) {
                    ss << system_prompt;
                    system_prompt = "";
                }
                ss << message->content << "<|eot_id|>Assistant: ";
            } else {
                ss << message->content;
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
        // dots.llm1.inst (DOTS1)
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|system|>" << message->content << "<|endofsystem|>";
            } else if (role == "user") {
                ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
            } else {
                ss << "<|response|>" << message->content << "<|endofresponse|>";
            }
        }
        if (add_ass) {
            ss << "<|response|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
        // tencent/Hunyuan-A13B-Instruct
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
            } else if (role == "assistant") {
                ss << "<|startoftext|>" << message->content << "<|eos|>";
            } else {
                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
        // moonshotai/Kimi-K2-Instruct
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|im_system|>system<|im_middle|>" << message->content << "<|im_end|>";
            } else if (role == "assistant") {
                ss << "<|im_user|>user<|im_middle|>" << message->content << "<|im_end|>";
            } else {
                ss << "<|im_assistant|>assistant<|im_middle|>" << message->content << "<|im_end|>";
            }
        }
        if (add_ass) {
            ss << "<|im_assistant|>assistant<|im_middle|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
        // OpenAI MoE (based on Harmony chat template)
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|start|>" << role << "<|message|>" << message->content;
            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
        }
        if (add_ass) {
            ss << "<|start|>assistant";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
            }
            else if (role == "user") {
                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
            }
            else if (role == "assistant") {
                ss << "Assistant: " << message->content << "<|separator|>\n\n";
            }
        }
        if (add_ass) {
            ss << "Assistant:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
        for (auto message: chat) {
            std::string role(message->role);
            ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
        }
        if (add_ass) {
            ss << "<seed:bos>assistant\n";
        }
    } else {
        // template not supported
        return -1;
    }
    dest = ss.str();
    return dest.size();
}

int32_t llama_chat_apply_template(
                              const char * tmpl,
         const struct llama_chat_message * chat,
                                  size_t   n_msg,
                                    bool   add_ass,
                                    char * buf,
                                 int32_t   length) {
    std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
    //if (tmpl == nullptr) {
    //    GGML_ASSERT(model != nullptr);

    //    // load template from model, if available
    //    const auto & it = model->gguf_kv.find("tokenizer.chat_template");
    //    if (it != model->gguf_kv.end() && it->second.size() > 0) {
    //        curr_tmpl = it->second;
    //    }
    //    else {
    //        // worst case: there is no information about template, we will use chatml by default
    //        curr_tmpl = "chatml";  // see llama_chat_apply_template_internal
    //    }
    //}

    // format the chat to string
    std::vector<const llama_chat_message *> chat_vec;
    chat_vec.resize(n_msg);
    for (size_t i = 0; i < n_msg; i++) {
        chat_vec[i] = &chat[i];
    }

    std::string formatted_chat;
    llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
        return -1;
    }
    int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
    if (res < 0) {
        return res;
    }
    if (buf && length > 0) {
        strncpy(buf, formatted_chat.c_str(), length);
    }
    return res;
}

int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
    auto it = LLM_CHAT_TEMPLATES.begin();
    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
        output[i] = it->first.c_str();
        std::advance(it, 1);
    }
    return (int32_t) LLM_CHAT_TEMPLATES.size();
}
//
// grammar
//

void llama_grammar_free(struct llama_grammar * grammar) {
    llama_grammar_free_impl(grammar);
}


struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
    return llama_grammar_clone_impl(*grammar);
}

void llama_grammar_apply(
      const struct llama_grammar * grammar,
      const struct llama_context * ctx,
          llama_token_data_array * candidates) {
    llama_grammar_sample_impl(grammar, &ctx->model.vocab, &ctx->sampling, candidates);
}

void llama_sample_grammar(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
      const struct llama_grammar * grammar) {
    llama_grammar_apply(grammar, ctx, candidates);
}

void llama_grammar_accept_token(
            struct llama_grammar * grammar,
            struct llama_context * ctx,
                     llama_token   token) {
    llama_grammar_accept_impl(*grammar, &ctx->model.vocab, &ctx->sampling, token);
}

//
// sampling
//

void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
    llama_set_rng_seed_impl(&ctx->sampling, seed);
}

void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
    llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates, /* normalize */ true);
}

void llama_sample_dist(struct llama_context * ctx, llama_token_data_array * candidates) {
    llama_sample_softmax_impl(ctx ? &ctx->sampling : nullptr, candidates,  /* normalize */ false);

}
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
    llama_sample_top_k_impl(ctx ? &ctx->sampling : nullptr, candidates, k, min_keep);
}

void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    llama_sample_top_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
}

void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    llama_sample_min_p_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
}

void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
    llama_sample_tail_free_impl(ctx ? &ctx->sampling : nullptr, candidates, z, min_keep);
}

void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    llama_sample_typical_impl(ctx ? &ctx->sampling : nullptr, candidates, p, min_keep);
}

void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
    llama_sample_entropy_impl(ctx ? &ctx->sampling : nullptr, candidates_p, min_temp, max_temp, exponent_val);
}

void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    llama_sample_temp_impl(ctx ? &ctx->sampling : nullptr, candidates_p, temp);
}

void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates_p,
                           float   probability, float threshold, size_t min_keep) {
    llama_sample_xtc_impl(ctx ? &ctx->sampling : nullptr, candidates_p, probability, threshold, min_keep);
}

void llama_sample_top_n_sigma(struct llama_context * ctx, llama_token_data_array * candidates_p, float top_n_sigma) {
    llama_sample_top_n_sigma_impl(ctx ? &ctx->sampling : nullptr, candidates_p, top_n_sigma);
}


void llama_sample_dry([[maybe_unused]] struct llama_context* ctx, struct llama_sampler_dry* smpl,  llama_token_data_array* candidates_p) {
    llama_sampler_dry_apply(smpl, candidates_p);
}

void llama_sample_adaptive_p(llama_context * ctx,
                   llama_token_data_array  * candidates,
                 llama_sampler_adaptive_p  * adapt_p_ctx) {
    llama_sample_adaptive_p_impl(&ctx->sampling, candidates, adapt_p_ctx);
}

void llama_prep_adaptive_p(struct llama_context * ctx, llama_token_data_array * candidates, struct llama_sampler_adaptive_p * adapt_p_ctx) {
    llama_prep_adaptive_p_impl(&ctx->sampling, candidates, adapt_p_ctx);
}


void llama_sample_repetition_penalties(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
                          size_t   penalty_last_n,
                           float   penalty_repeat,
                           float   penalty_freq,
                           float   penalty_present) {
    llama_sample_repetition_penalties_impl(ctx ? &ctx->sampling : nullptr, candidates, last_tokens, penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
}

void llama_sample_apply_guidance(
          struct llama_context * ctx,
                         float * logits,
                         float * logits_guidance,
                         float   scale) {
    llama_sample_apply_guidance_impl(&ctx->sampling, logits, logits_guidance, scale);
}

llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
    return llama_sample_token_mirostat_impl(&ctx->sampling, candidates, tau, eta, m, mu);
}

llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
    return llama_sample_token_mirostat_v2_impl(ctx ? &ctx->sampling : nullptr, candidates, tau, eta, mu);
}

llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
    return llama_sample_token_greedy_impl(ctx ? &ctx->sampling : nullptr, candidates);
}

llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, rng);
}

llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
    return llama_sample_token_with_rng_impl(&ctx->sampling, candidates, ctx->sampling.rng);
}

llama_token llama_sample_token_adaptive_p(
               struct llama_context * ctx,
             llama_token_data_array * candidates,
    struct llama_sampler_adaptive_p * adapt_p_ctx) {
    return llama_sample_token_adaptive_p_impl(&ctx->sampling, candidates, adapt_p_ctx);
}

int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";
    if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) {
        return strlen(split_path);
    }
    return 0;
}


struct llama_sampler_dry * llama_sampler_init_dry(const struct llama_vocab* vocab, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    return llama_sampler_init_dry_impl(*vocab, vocab->n_tokens(), dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, seq_breakers, num_breakers);
}

void llama_sampler_dry_reset(struct llama_sampler_dry* smpl) {
    if (!smpl) {
        return;
    }
    smpl->last_tokens.clear();
    smpl->dry_repeat_count.clear();
    smpl->dry_max_token_repeat.clear();
}

void llama_sampler_dry_free(struct llama_sampler_dry* smpl) {
    delete smpl;
}

struct llama_sampler_dry* llama_sampler_dry_clone(struct llama_sampler_dry* smpl) {
    if (!smpl) {
        return nullptr;
    }
    return new llama_sampler_dry {
        /* .total_context_size     = */ smpl->total_context_size,
        /* .dry_multiplier         = */ smpl->dry_multiplier,
        /* .dry_base               = */ smpl->dry_base,
        /* .dry_allowed_length     = */ smpl->dry_allowed_length,
        /* .dry_penalty_last_n     = */ smpl->dry_penalty_last_n,
        /* .dry_processed_breakers = */ smpl->dry_processed_breakers,
        /* .dry_repeat_count       = */ smpl->dry_repeat_count,
        /* .dry_max_token_repeat   = */ smpl->dry_max_token_repeat,
        /* .last_tokens            = */ smpl->last_tokens,
    };
}

void llama_sampler_dry_accept(struct llama_sampler_dry* smpl, llama_token token) {
    if (!smpl) {
        return;
    }
    if (smpl->dry_multiplier == 0.0f || smpl->dry_base < 1.0f || smpl->dry_penalty_last_n == 0) {
        return;
    }
    smpl->last_tokens.push_back(token);
}


struct llama_sampler_adaptive_p * llama_init_adaptive_p(int n_vocab, const float target, const float decay, const bool updt_w_cur, const uint32_t seed) {
    return llama_init_adaptive_p_impl(n_vocab, target, decay, updt_w_cur, seed);
}

struct llama_sampler_adaptive_p * llama_clone_adaptive_p(const struct llama_sampler_adaptive_p * adapt_p_ctx) {
    if (adapt_p_ctx == nullptr) {
        return nullptr;
    }

    return new llama_sampler_adaptive_p(*adapt_p_ctx);
}

void llama_free_adaptive_p(struct llama_sampler_adaptive_p * adapt_p_ctx) {
    delete adapt_p_ctx;
}

void llama_review_adaptive_p(struct llama_sampler_adaptive_p * adapt_p_ctx, const size_t n_unsent, const bool rewind_status) {
    llama_review_adaptive_p_impl(adapt_p_ctx, n_unsent, rewind_status);
}


int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
    std::string str_split_path(split_path);
    char postfix[32];
    snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count);
    std::string str_postfix(postfix);

    // check if dest ends with postfix
    int size_prefix = str_split_path.size() - str_postfix.size();
    if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) {
        snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path);
        return size_prefix;
    }

    return 0;
}

struct llama_timings llama_get_timings(struct llama_context * ctx) {
    struct llama_timings result = {
        /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
        /*.t_end_ms    =*/ 1.00 * ggml_time_ms(),
        /*.t_load_ms   =*/ 1e-3 * ctx->t_load_us,
        /*.t_sample_ms =*/ 1e-3 * ctx->sampling.t_sample_us,
        /*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

        /*.n_sample =*/ std::max(1, ctx->sampling.n_sample),
        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
        /*.n_eval   =*/ std::max(1, ctx->n_eval),
    };

    return result;
}

void llama_print_timings(struct llama_context * ctx) {
    const llama_timings timings = llama_get_timings(ctx);

    LLAMA_LOG_INFO("\n");
    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}

void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us  = ggml_time_us();
    ctx->t_eval_us   = ctx->n_eval   = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;

    ctx->sampling.reset_timings();
}

const char * llama_print_system_info(void) {
    static std::string s;

    s  = "";
    s += "AVX = "         + std::to_string(ggml_cpu_has_avx())         + " | ";
    s += "AVX_VNNI = "    + std::to_string(ggml_cpu_has_avx_vnni())    + " | ";
    s += "AVX2 = "        + std::to_string(ggml_cpu_has_avx2())        + " | ";
    s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
    s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
    s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
    s += "SVE = "         + std::to_string(ggml_cpu_has_sve())         + " | ";
    s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
    s += "F16C = "        + std::to_string(ggml_cpu_has_f16c())        + " | ";
    s += "FP16_VA = "     + std::to_string(ggml_cpu_has_fp16_va())     + " | ";
    s += "WASM_SIMD = "   + std::to_string(ggml_cpu_has_wasm_simd())   + " | ";
    s += "BLAS = "        + std::to_string(ggml_cpu_has_blas())        + " | ";
    s += "SSE3 = "        + std::to_string(ggml_cpu_has_sse3())        + " | ";
    s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
    s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";

    return s.c_str();
}

void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "\n");

    fprintf(stream, "mst_eval: %.2f  # ms / token during generation\n",
            1.0e-3 * ctx->t_eval_us / ctx->n_eval);
    fprintf(stream, "mst_p_eval: %.2f  # ms / token during prompt processing\n",
            1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
    fprintf(stream, "mst_sample: %.2f  # ms / token during sampling\n",
            1.0e-3 * ctx->sampling.t_sample_us / ctx->sampling.n_sample);
    fprintf(stream, "n_eval: %d  # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d  # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "n_sample: %d  # number of sampled tokens\n", ctx->sampling.n_sample);
    fprintf(stream, "t_eval_us: %" PRId64 "  # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 "  # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 "  # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
    fprintf(stream, "t_sample_us: %" PRId64 "  # total microseconds spent sampling\n", ctx->sampling.t_sample_us);
    fprintf(stream, "ts_eval: %.2f  # tokens / second during generation\n",
            1.0e6 * ctx->n_eval / ctx->t_eval_us);
    fprintf(stream, "ts_p_eval: %.2f  # tokens / second during prompt processing\n",
            1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
    fprintf(stream, "ts_sample: %.2f  # tokens / second during sampling\n",
            1.0e6 * ctx->sampling.n_sample / ctx->sampling.t_sample_us);
}

// For internal test use
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
) {
    return ctx->model.tensors_by_name;
}

void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
    g_state.log_callback_user_data = user_data;
#ifdef GGML_USE_METAL
    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#elif defined(GGML_USE_CUDA)
    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#elif defined(GGML_USE_CANN)
    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#endif
}

static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
    va_list args_copy;
    va_copy(args_copy, args);
    char buffer[128];
    int len = vsnprintf(buffer, 128, format, args);
    if (len < 128) {
        g_state.log_callback(level, buffer, g_state.log_callback_user_data);
    } else {
        char* buffer2 = new char[len+1];
        vsnprintf(buffer2, len+1, format, args_copy);
        buffer2[len] = 0;
        g_state.log_callback(level, buffer2, g_state.log_callback_user_data);
        delete[] buffer2;
    }
    va_end(args_copy);
}

void llama_log_internal(ggml_log_level level, const char * format, ...) {
    va_list args;
    va_start(args, format);
    llama_log_internal_v(level, format, args);
    va_end(args);
}

void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
    fflush(stderr);
}

void llama_set_offload_policy(struct llama_context * lctx, int op, bool on_or_off) {
    if (!lctx || !lctx->sched) return;
    const char * op_name = op < 0 || op >= int(GGML_OP_COUNT) ? "all ops" : ggml_op_name(ggml_op(op));
    LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXX offload(%s) = %d\n", op_name, on_or_off);
    ggml_backend_sched_set_op_offload(lctx->sched, ggml_op(op), on_or_off);
}

void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float * hidden_state) {
    ctx->draft_input_hidden_state_owned.clear();
    ctx->draft_input_hidden_state = hidden_state;
    ctx->draft_input_hidden_state_n_floats = ctx->inp_mtp_states
        ? ggml_nbytes(ctx->inp_mtp_states) / sizeof(float)
        : 0;
}

void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_context * target_ctx) {
    ctx->mtp_target_ctx = target_ctx;
}

size_t llama_fill_from_utf8(void* utf8, void* cpts, void* scripts) {
    return unicode_fill_from_utf8((std::string*)utf8, (std::vector<uint32_t>*)cpts, (std::vector<std::string>*)scripts);
}