#include "llama-impl.h"
#include "llama-vocab.h"
#include "llama-grammar.h"
#include "llama-sampling.h"
#include "llama-arch.h"
#include "llama-mmap.h"
#include "llama-model-loader.h"
#include "llama-model.h"
#include "llama-build-context.h"
#include "llama-cparams.h"
#include "llama-hparams.h"
#include "llama-context.h"
#include "llama-spec-features.h"
#include "llama-quantize.h"
#include "unicode.h"
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
void llama_set_mtp_target_context(struct llama_context * ctx, struct llama_context * target_ctx);
#include "iqk/iqk_quantize.h"
#include "iqk/iqk_cpu_ops.h"
#define IK_PRINT_TIMING 0
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h"
#elif defined(GGML_USE_CANN)
# include "ggml-cann.h"
#endif
#ifdef GGML_USE_BLAS
# include "ggml-blas.h"
#endif
#ifdef GGML_USE_METAL
# include "ggml-metal.h"
#endif
#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif
#define LU8(x) (const char*)(u8##x)
#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <functional>
#include <future>
#include <initializer_list>
#include <locale>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <set>
#include <unordered_set>
#include <sstream>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <regex>
#include <tuple>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267)
#endif
#define LLAMA_MAX_LAYERS 512
// Tells whether the byte `c` is whitespace.
// Only ASCII bytes (<= 0x7F) are classified, using isspace(); any byte above
// that range is reported as non-whitespace.
static bool is_utf8_whitespace(uint8_t c) {
    const bool is_ascii = c <= 0x7F;
    return is_ascii ? isspace(c) != 0 : false;
}
// Returns a copy of `str` with leading and trailing whitespace bytes removed,
// where "whitespace" is whatever is_utf8_whitespace() accepts.
static std::string trim(const std::string & str) {
    size_t first = 0;
    size_t last  = str.size();
    // Advance past leading whitespace.
    for (; first < last; ++first) {
        if (!is_utf8_whitespace(str[first])) {
            break;
        }
    }
    // Retreat past trailing whitespace, never crossing `first`.
    for (; last > first; --last) {
        if (!is_utf8_whitespace(str[last - 1])) {
            break;
        }
    }
    return str.substr(first, last - first);
}
// File-local stop flag. It is cleared by llama_decode_reset() and raised by
// llama_decode_stop(); the code that reads it is outside this part of the file.
// NOTE(review): plain (non-atomic) bool — confirm it is only touched from one thread.
static bool stop_internal_decode = false;
// Clears the stop flag.
void llama_decode_reset() {
stop_internal_decode = false;
}
// Raises the stop flag.
void llama_decode_stop() {
stop_internal_decode = true;
}
// Splits `str` at every occurrence of `delimiter`.
//
// Returns the pieces in order; empty pieces are kept (e.g. "a||b" with "|"
// yields {"a", "", "b"}), and a string without any delimiter yields a single
// element equal to `str`.
//
// An empty `delimiter` matches at every position without consuming input, so
// it would never let the scan advance; in that case the whole string is
// returned as one piece.
static std::vector<std::string> string_split(const std::string& str, const std::string& delimiter) {
    std::vector<std::string> parts;
    if (delimiter.empty()) {
        parts.push_back(str);
        return parts;
    }
    size_t start = 0;
    for (size_t end = str.find(delimiter); end != std::string::npos; end = str.find(delimiter, start)) {
        parts.push_back(str.substr(start, end - start));
        start = end + delimiter.length();
    }
    // Tail after the last delimiter (or the whole string if none was found).
    parts.push_back(str.substr(start));
    return parts;
}
static std::vector<rpc_device> extract_device_from_rpc_device(std::vector<std::string> devices) {
std::vector<rpc_device> rpc_servers;
for (auto & device : devices) {
rpc_device rpc;
auto value = string_split(device, "|");
if (value.size() == 2) {
rpc.device = std::stoi(value[1]);
rpc.endpoint = value[0];
}
rpc_servers.push_back(rpc);
}
return rpc_servers;
}
// Identifiers for the built-in chat templates.
// LLM_CHAT_TEMPLATES (below) maps user-facing names to these values.
// LLM_CHAT_TEMPLATE_UNKNOWN is the last enumerator and has no name entry.
enum llm_chat_template {
LLM_CHAT_TEMPLATE_CHATML,
LLM_CHAT_TEMPLATE_LLAMA_2,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
LLM_CHAT_TEMPLATE_MISTRAL_V1,
LLM_CHAT_TEMPLATE_MISTRAL_V3,
LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
LLM_CHAT_TEMPLATE_MISTRAL_V7,
LLM_CHAT_TEMPLATE_PHI_3,
LLM_CHAT_TEMPLATE_FALCON_3,
LLM_CHAT_TEMPLATE_FALCON_E,
LLM_CHAT_TEMPLATE_ZEPHYR,
LLM_CHAT_TEMPLATE_MONARCH,
LLM_CHAT_TEMPLATE_GEMMA,
LLM_CHAT_TEMPLATE_ORION,
LLM_CHAT_TEMPLATE_OPENCHAT,
LLM_CHAT_TEMPLATE_VICUNA,
LLM_CHAT_TEMPLATE_VICUNA_ORCA,
LLM_CHAT_TEMPLATE_DEEPSEEK,
LLM_CHAT_TEMPLATE_DEEPSEEK_2,
LLM_CHAT_TEMPLATE_DEEPSEEK_3,
LLM_CHAT_TEMPLATE_COMMAND_R,
LLM_CHAT_TEMPLATE_LLAMA_3,
LLM_CHAT_TEMPLATE_CHATGLM_3,
LLM_CHAT_TEMPLATE_CHATGLM_4,
LLM_CHAT_TEMPLATE_MINICPM,
LLM_CHAT_TEMPLATE_EXAONE_3,
LLM_CHAT_TEMPLATE_RWKV_WORLD,
LLM_CHAT_TEMPLATE_GRANITE,
LLM_CHAT_TEMPLATE_GIGACHAT,
LLM_CHAT_TEMPLATE_MEGREZ,
LLM_CHAT_TEMPLATE_LLAMA4,
LLM_CHAT_TEMPLATE_BITNET,
// NOTE(review): DOTS1 has no entry in LLM_CHAT_TEMPLATES — confirm whether that is intended.
LLM_CHAT_TEMPLATE_DOTS1,
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_GROK_2,
LLM_CHAT_TEMPLATE_BAILING,
LLM_CHAT_TEMPLATE_BAILING_THINK,
LLM_CHAT_TEMPLATE_BAILING2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_UNKNOWN,
};
static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
{ "chatml", LLM_CHAT_TEMPLATE_CHATML },
{ "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
{ "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
{ "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
{ "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
{ "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
{ "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
{ "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
{ "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
{ "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
{ "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
{ "falcon_e", LLM_CHAT_TEMPLATE_FALCON_E },
{ "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
{ "monarch", LLM_CHAT_TEMPLATE_MONARCH },
{ "gemma", LLM_CHAT_TEMPLATE_GEMMA },
{ "orion", LLM_CHAT_TEMPLATE_ORION },
{ "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
{ "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
{ "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
{ "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
{ "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
{ "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
{ "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
{ "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
{ "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 },
{ "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 },
{ "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
{ "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
{ "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "bitnet", LLM_CHAT_TEMPLATE_BITNET },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
{ "bailing", LLM_CHAT_TEMPLATE_BAILING },
{ "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
{ "bailing2", LLM_CHAT_TEMPLATE_BAILING2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
};
// Formats element `i` of the raw GGUF value array `data` as text.
// Numeric types go through std::to_string, booleans become "true"/"false",
// and any other type yields "unknown type <id>".
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
    // Reads element i of `data` interpreted as an array of the tag's type.
    const auto elem = [data, i](auto tag) {
        using T = decltype(tag);
        return static_cast<const T *>(data)[i];
    };
    switch (type) {
        case GGUF_TYPE_UINT8:   return std::to_string(elem(uint8_t{}));
        case GGUF_TYPE_INT8:    return std::to_string(elem(int8_t{}));
        case GGUF_TYPE_UINT16:  return std::to_string(elem(uint16_t{}));
        case GGUF_TYPE_INT16:   return std::to_string(elem(int16_t{}));
        case GGUF_TYPE_UINT32:  return std::to_string(elem(uint32_t{}));
        case GGUF_TYPE_INT32:   return std::to_string(elem(int32_t{}));
        case GGUF_TYPE_UINT64:  return std::to_string(elem(uint64_t{}));
        case GGUF_TYPE_INT64:   return std::to_string(elem(int64_t{}));
        case GGUF_TYPE_FLOAT32: return std::to_string(elem(float{}));
        case GGUF_TYPE_FLOAT64: return std::to_string(elem(double{}));
        case GGUF_TYPE_BOOL:    return elem(bool{}) ? "true" : "false";
        default:                return format("unknown type %d", type);
    }
}
// Renders the value of key/value pair `i` of `ctx_gguf` as a string.
//
// - String values are returned as-is.
// - Arrays are rendered as "[a, b, ...]". String elements are double-quoted
//   with backslashes and double quotes escaped; nested arrays are rendered
//   as "???"; other elements are formatted by gguf_data_to_str().
// - Any other scalar is formatted by gguf_data_to_str().
std::string gguf_kv_to_str(const gguf_context * ctx_gguf, int i) {
    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
    switch (type) {
        case GGUF_TYPE_STRING:
            return gguf_get_val_str(ctx_gguf, i);
        case GGUF_TYPE_ARRAY:
            {
                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
                const int arr_n = gguf_get_arr_n(ctx_gguf, i);
                // The raw data pointer is only meaningful for arrays of plain
                // values. String arrays are read element-wise through
                // gguf_get_arr_str(), and some ggml versions assert when
                // gguf_get_arr_data() is called on them, so do not ask for it.
                const bool has_raw_data = arr_type != GGUF_TYPE_STRING && arr_type != GGUF_TYPE_ARRAY;
                const void * data = has_raw_data ? gguf_get_arr_data(ctx_gguf, i) : nullptr;
                std::stringstream ss;
                ss << "[";
                for (int j = 0; j < arr_n; j++) {
                    if (arr_type == GGUF_TYPE_STRING) {
                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
                        // Escape backslashes first so the escapes added for
                        // quotes are not escaped again.
                        replace_all(val, "\\", "\\\\");
                        replace_all(val, "\"", "\\\"");
                        ss << '"' << val << '"';
                    } else if (arr_type == GGUF_TYPE_ARRAY) {
                        ss << "???";
                    } else {
                        ss << gguf_data_to_str(arr_type, data, j);
                    }
                    if (j < arr_n - 1) {
                        ss << ", ";
                    }
                }
                ss << "]";
                return ss.str();
            }
        default:
            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
    }
}
// Picks the buffer type used for tensors that stay on the CPU side.
//
// With CUDA, SYCL or Vulkan builds, `host_buffer == true` selects that
// backend's host buffer type. With GGML_USE_CPU_HBM the HBM buffer type is
// selected regardless of `host_buffer`. If none of these yields a buffer type,
// the plain CPU buffer type is returned.
ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_CUDA)
if (host_buffer) {
buft = ggml_backend_cuda_host_buffer_type();
}
#elif defined(GGML_USE_SYCL)
if (host_buffer) {
buft = ggml_backend_sycl_host_buffer_type();
}
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#elif defined(GGML_USE_VULKAN)
if (host_buffer) {
buft = ggml_backend_vk_host_buffer_type();
}
#endif
// Fallback: no backend-specific host buffer was chosen above.
if (buft == nullptr) {
buft = ggml_backend_cpu_buffer_type();
}
return buft;
// Never executed; references the parameter for builds where no branch above uses it.
GGML_UNUSED(host_buffer);
}
// Holds the log callback and its user data.
// On construction, the callback is also registered with the Metal, CUDA or
// CANN backend, depending on which one the build enables.
struct llama_state {
llama_state() {
// The members below have default initializers, so they already hold their
// defaults when this body runs.
#ifdef GGML_USE_METAL
ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
#elif defined(GGML_USE_CUDA)
ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
#elif defined(GGML_USE_CANN)
ggml_backend_cann_log_set_callback(log_callback, log_callback_user_data);
#endif
}
// Log sink; defaults to llama_log_callback_default.
ggml_log_callback log_callback = llama_log_callback_default;
// Opaque pointer passed to the backend log registration alongside the callback.
void * log_callback_user_data = nullptr;
};
// File-local llama_state instance; its constructor runs during static initialization.
static llama_state g_state;
// Binary size units in bytes: 1024, 1024^2 and 1024^3.
static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;
// Returns the printable name of an expert gating function type, or "unknown"
// for any value without a dedicated name.
static const char * llama_expert_gating_func_name(llm_expert_gating_func_type type) {
    if (type == LLM_EXPERT_GATING_FUNC_SOFTMAX) {
        return "softmax";
    }
    if (type == LLM_EXPERT_GATING_FUNC_SIGMOID) {
        return "sigmoid";
    }
    if (type == LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT) {
        return "softmax_weight";
    }
    return "unknown";
}
// Releases everything the model owns: ggml contexts, backend buffers and LoRA adapters.
llama_model::~llama_model() {
for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
#ifdef GGML_USE_CUDA
// CPU-type buffers are unregistered from CUDA before being freed.
// NOTE(review): presumably these were registered as CUDA host buffers at load time — confirm.
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
}
#endif
ggml_backend_buffer_free(buf);
}
// NOTE(review): this loop only terminates if llama_lora_adapter_free() removes
// the adapter from lora_adapters — confirm against its definition.
while (!lora_adapters.empty()) {
llama_lora_adapter_free(*lora_adapters.begin());
}
}
// Returns the number of devices available for offloading.
//
// The count starts from the enabled GPU backend's device count (1 when no
// such backend is compiled in). With GGML_USE_RPC, the model's RPC servers
// are appended after the local devices; callers rely on that layout to map
// indices >= (count - rpc_servers.size()) to RPC servers.
static size_t llama_get_device_count(const llama_model & model) {
    size_t count = 1;
#if defined(GGML_USE_CUDA)
    count = ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
    count = ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
    count = ggml_backend_vk_get_device_count();
#elif defined(GGML_USE_CANN)
    // Assign instead of returning early, so RPC servers are added below just
    // like for every other backend.
    count = ggml_backend_cann_get_device_count();
#endif
#if defined(GGML_USE_RPC)
    count += model.rpc_servers.size();
#endif
    return count;
    // Never executed; references the parameter for builds without RPC.
    GGML_UNUSED(model);
}
// Returns the buffer type for tensors offloaded to device index `gpu`.
//
// With RPC enabled, indices in the last `rpc_servers.size()` slots of the
// device range refer to RPC servers and get an RPC buffer type. Otherwise the
// compiled-in GPU backend supplies the buffer type. If no backend provides
// one, the host-buffer variant of the CPU buffer type is used.
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#if defined(GGML_USE_RPC)
int dev_count = (int)llama_get_device_count(model);
int rpc_count = (int)model.rpc_servers.size();
// RPC servers occupy the tail of the device index range.
if (gpu >= dev_count - rpc_count) {
int rpc_idx = gpu - dev_count + rpc_count;
rpc_device rpc = model.rpc_servers[rpc_idx];
const char * endpoint = rpc.endpoint.c_str();
return ggml_backend_rpc_buffer_type(endpoint, rpc.device);
}
#endif
#if defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#elif defined(GGML_USE_CANN)
buft = ggml_backend_cann_buffer_type(gpu);
#endif
// No GPU backend (or it declined): fall back to the CPU host buffer type.
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
// Never executed; references the parameters for builds where no branch above uses them.
GGML_UNUSED(model);
GGML_UNUSED(gpu);
}
// Returns the buffer type for tensors split across several devices.
//
// CUDA and SYCL builds use their split buffer type, driven by `model.splits`,
// when more than one device is present. In every other case this falls back
// to the regular offload buffer type of `fallback_gpu`.
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(model.splits.data());
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(model.splits.data());
}
#endif
// Single device or no split-capable backend.
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu);
}
return buft;
}
// Number of offload devices for this model, as reported by llama_get_device_count().
int llama_model::device_count() const {
    const size_t n_devices = llama_get_device_count(*this);
    return static_cast<int>(n_devices);
}
// Member-function front end for llama_default_buffer_type_offload() applied to this model.
ggml_backend_buffer_type_t llama_model::default_buffer_type_offload(int device) const {
    const llama_model & self = *this;
    return llama_default_buffer_type_offload(self, device);
}
// Returns the free memory reported for device index `device`.
//
// With RPC enabled, indices in the last `rpc_servers.size()` slots of the
// device range are queried through the RPC backend. Otherwise the compiled-in
// GPU backend is queried. Builds without any such backend return 1.
// The `total` value reported by the backends is not used.
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
int dev_count = (int)llama_get_device_count(model);
int rpc_count = (int)model.rpc_servers.size();
// RPC servers occupy the tail of the device index range.
if (device >= dev_count - rpc_count) {
size_t total;
size_t free;
rpc_device rpc = model.rpc_servers[device - dev_count + rpc_count];
const char * endpoint = rpc.endpoint.c_str();
ggml_backend_rpc_get_device_memory(endpoint, rpc.device, &free, &total);
return free;
}
#endif
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_CANN)
size_t total;
size_t free;
ggml_backend_cann_get_device_memory(device, &free, &total);
return free;
#else
return 1;
#endif
// Never executed (every branch above returns); references the parameters
// for builds where no branch uses them.
GGML_UNUSED(model);
GGML_UNUSED(device);
}
// Record describing a previously built graph.
// can_reuse_graph() compares the current batch and KV-cache state against
// these fields to decide whether `graph` may be reused.
struct llama_context::Prev {
// Compared against llama_batch::all_seq_id.
int all_seq_id;
// Compared against the context's n_outputs.
int n_outputs;
// Compared against kv_self.n.
int n_kv;
// Compared against llama_batch::n_tokens.
int n_tokens;
// Compared against kv_self.save_per_step_ssm.
int save_per_step_ssm;
// Compared against kv_self.ckpt.per_step_max_allocated.
int per_step_max_allocated;
// Compared against cparams.mtp_op_type.
llama_mtp_op_type mtp_op_type;
// The recorded graph; a null graph prevents reuse.
ggml_cgraph * graph;
};
// Resets the backend scheduler and drops both recorded graph states
// (`prev` and `prev_mtp`), so can_reuse_graph() will find nothing to reuse.
void llama_context::reset_scheduler() {
ggml_backend_sched_reset(sched);
prev.reset();
prev_mtp.reset();
}
// Decides whether the previously recorded graph can be reused for `u_batch`.
//
// Reuse is refused when graph reuse is disabled, when a Gemma4 MTP/assistant
// model has an MTP target context, when no graph was recorded for the current
// MTP mode, when the batch carries embeddings, or when any recorded parameter
// differs from the current state. Only if every check passes is
// update_cache_copies() called; its result is the final answer.
bool llama_context::can_reuse_graph(const llama_batch & u_batch) {
    if (!cparams.graph_reuse) {
        return false;
    }

    const bool gemma4_mtp_arch = model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT;
    if (gemma4_mtp_arch && mtp_target_ctx != nullptr) {
        return false;
    }

    // Regular decoding and MTP operations keep separate records.
    auto recorded = cparams.mtp_op_type == MTP_OP_NONE ? prev.get() : prev_mtp.get();
    if (!recorded || !recorded->graph) {
        return false;
    }
    if (u_batch.embd) {
        return false;
    }

    if (recorded->save_per_step_ssm != kv_self.save_per_step_ssm) {
        return false;
    }
    if (recorded->per_step_max_allocated != kv_self.ckpt.per_step_max_allocated) {
        return false;
    }

    if (!(u_batch.all_seq_id == recorded->all_seq_id)) {
        return false;
    }
    if (!(kv_self.head > 0)) {
        return false;
    }
    if (!(kv_self.n == recorded->n_kv)) {
        return false;
    }
    if (!(n_outputs == recorded->n_outputs)) {
        return false;
    }
    if (!(u_batch.n_tokens == recorded->n_tokens)) {
        return false;
    }
    if (!(cparams.mtp_op_type == recorded->mtp_op_type)) {
        return false;
    }

    // Last step: patches the recorded cache-copy nodes, so it must only run
    // once everything else matched.
    return update_cache_copies();
}
// Re-points the recorded KV-cache copy nodes at the current cache head.
//
// For every non-recurrent layer that has a K tensor, the matching entries of
// `cache_copies` are checked: each must hold a GGML_OP_CPY node whose view
// source is the corresponding cache tensor (or per-device split). If so, the
// node's view offset and data pointers are rewritten to `kv_self.head * step`.
// Returns false as soon as an entry does not match, or when the cache holds
// fewer layers than expected; returns true when all entries were updated.
// Entries patched before a mismatch is found stay patched.
bool llama_context::update_cache_copies() {
// Layers covered: all layers when the model has MTP and an MTP op is active,
// otherwise the layers excluding the trailing nextn_predict_layers.
const int n_layer = model.mtp && cparams.mtp_op_type != MTP_OP_NONE ?
model.hparams.n_layer : model.hparams.n_layer - model.hparams.nextn_predict_layers; auto layer_has_attention_kv = [&](int il) {
return !model.hparams.is_recurrent(il);
};
if ((int)kv_self.k_l.size() < n_layer) {
return false;
}
if (!kv_self.v_l.empty() && (int)kv_self.v_l.size() < n_layer) {
return false;
}
for (int il = 0; il < n_layer; ++il) {
if (!layer_has_attention_kv(il) || kv_self.k_l[il] == nullptr) {
continue;
}
// A non-null `extra` marks a K cache that is split across devices.
auto kl = (ggml_split_tensor_t *)kv_self.k_l[il]->extra;
if (kl) {
GGML_ASSERT(model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN);
GGML_ASSERT(model.splits.size() > 1);
auto vl = !kv_self.v_l.empty() && kv_self.v_l[il] ? (ggml_split_tensor_t *)kv_self.v_l[il]->extra : nullptr;
// If a V tensor exists for this layer it must be split as well.
GGML_ASSERT(kl && (kv_self.v_l.empty() || !kv_self.v_l[il] || vl));
if (vl) {
GGML_ASSERT(kl->n_device == vl->n_device);
}
for (int id = 0; id < kl->n_device; ++id) {
if (!kl->splits[id]) continue;
// Split layout: K of (layer il, device id) lives at 2*n_splits*il + 2*id, V right after it.
size_t idx = 2*model.splits.size()*il + 2*id + 0;
auto& c = cache_copies[idx];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kl->splits[id]) {
return false;
}
c.cpy->view_offs = kv_self.head*c.step;
c.cpy->src[1]->data = (char *)kl->splits[id]->data + c.cpy->view_offs;
c.cpy->data = c.cpy->src[1]->data;
}
if (!vl) continue;
for (int id = 0; id < vl->n_device; ++id) {
if (!vl->splits[id]) continue;
auto& c = cache_copies[2*model.splits.size()*il + 2*id + 1];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != vl->splits[id]) {
return false;
}
c.cpy->view_offs = kv_self.head*c.step;
c.cpy->src[1]->data = (char *)vl->splits[id]->data + c.cpy->view_offs;
c.cpy->data = c.cpy->src[1]->data;
}
} else {
// Unsplit layout: K at 2*il, V at 2*il + 1.
auto& c = cache_copies[2*il+0];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.k_l[il]) {
return false;
}
c.cpy->view_offs = kv_self.head*c.step;
c.cpy->src[1]->data = (char *)kv_self.k_l[il]->data + c.cpy->view_offs;
c.cpy->data = c.cpy->src[1]->data;
if (!kv_self.v_l.empty() && kv_self.v_l[il]) {
auto& c = cache_copies[2*il+1];
if (!c.cpy || c.cpy->op != GGML_OP_CPY || c.cpy->view_src != kv_self.v_l[il]) {
return false;
}
c.cpy->view_offs = kv_self.head*c.step;
c.cpy->src[1]->data = (char *)kv_self.v_l[il]->data + c.cpy->view_offs;
c.cpy->data = c.cpy->src[1]->data;
}
}
}
return true;
}
// Builds a context bound to `model` and sizes the cache-copy table.
// The table holds two entries per layer, multiplied by the number of splits
// when the model runs in graph or attention split mode with more than one split.
llama_context::llama_context(const llama_model & model)
    : model(model) , sampling(llama_n_vocab(&model)) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {
    const auto & hparams = model.hparams;

    const bool split_attention = model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN;
    const bool per_device_entries = split_attention && model.splits.size() > 1;

    size_t n_entries = 2*hparams.n_layer;
    if (per_device_entries) {
        n_entries = 2*model.splits.size()*hparams.n_layer;
    }
    cache_copies.resize(n_entries);
}
// Stores `value` as the context's current MTP operation type (cparams.mtp_op_type)
// after logging it at debug level.
void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
cparams.mtp_op_type = value;
}
// Frees the scheduler, then every backend, then the output buffer.
llama_context::~llama_context() {
ggml_backend_sched_free(sched);
for (ggml_backend_t backend : backends) {
ggml_backend_free(backend);
}
ggml_backend_buffer_free(buf_output);
}
// Upper bound on the number of graph nodes for a batch of `n_tokens` tokens
// attending to `n_kv` cache entries.
//
// Starts from the model's own estimate. For MLA models with mla_attn > 1,
// batches of at least 128 tokens, a positive attn_max_batch, a split mode
// other than GRAPH and a wkv_b tensor in layer 0, extra nodes are added when
// the attention is processed in several head groups:
// (groups - 1) * 10 nodes per layer.
int llama_context::max_nodes(int n_tokens, int n_kv) const {
    int node_budget = model.max_nodes(n_tokens);

    const bool mla_may_chunk =
        model.is_mla_model() &&
        cparams.mla_attn > 1 &&
        n_tokens >= 128 &&
        cparams.attn_max_batch > 0 &&
        model.split_mode != LLAMA_SPLIT_MODE_GRAPH &&
        model.layers[0].wkv_b;
    if (!mla_may_chunk) {
        return node_budget;
    }

    int n_head = model.hparams.n_head();
    auto wkv_b = model.layers[0].wkv_b;
    // Size of the f32 KV product in MiB (bytes divided by 1024*1024).
    auto kv_f32_size = wkv_b->ne[1] * n_kv * sizeof(float) / (1024*1024);
    if (!(cparams.attn_max_batch < kv_f32_size)) {
        return node_budget;
    }

    // Smallest divisor of n_head (>= 2) that brings each group under the limit;
    // without one, heads are handled one at a time.
    int n_max_head = 1;
    int niter = 2;
    while (niter < n_head) {
        if (n_head % niter == 0 && kv_f32_size/niter <= cparams.attn_max_batch) {
            n_max_head = n_head/niter;
            break;
        }
        ++niter;
    }
    GGML_ASSERT(n_head % n_max_head == 0);

    int n_iter = n_head / n_max_head;
    node_budget += (n_iter - 1) * 10 * model.hparams.n_layer;
    return node_budget;
}
// Free-function form of llama_hparams::is_recurrent() for layer `il`.
static inline bool llama_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) {
    const bool recurrent = hparams.is_recurrent(il);
    return recurrent;
}
// Row width of the V cache for layer `il`.
// Hybrid architectures use n_embd_v_gqa(il) alone; all others add n_embd_v_s().
static inline uint32_t llama_kv_v_row_embd(
        const llama_model & model,
        const llama_hparams & hparams,
        uint32_t il) {
    const bool hybrid = llm_arch_is_hybrid(model.arch);
    return hybrid ? hparams.n_embd_v_gqa(il)
                  : hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
}
// Number of recurrent-state slots: n_seq_max raised to at least 1, then capped at kv_size.
static inline uint32_t llama_qwen3next_state_slots(const llama_cparams & cparams, uint32_t kv_size) {
    const uint32_t wanted = std::max<uint32_t>(1, cparams.n_seq_max);
    return std::min<uint32_t>(wanted, kv_size);
}
static inline uint32_t llama_kv_qnext_state_slots(const llama_kv_cache & cache) {
uint32_t n_slots = 0;
for (const ggml_tensor * t : cache.s_l) {
if (t == nullptr) {
continue;
}
const uint32_t layer_slots = (uint32_t) t->ne[1];
if (n_slots == 0) {
n_slots = layer_slots;
} else {
GGML_ASSERT(n_slots == layer_slots);
}
}
return n_slots;
}
// True when the cache holds at least one recurrent-state slot.
static inline bool llama_kv_has_qnext_state_storage(const llama_kv_cache & cache) {
    const uint32_t n_slots = llama_kv_qnext_state_slots(cache);
    return n_slots != 0;
}
// True when `seq_id` is a valid index into the cache's recurrent-state slots,
// i.e. the cache has slots and 0 <= seq_id < slot count.
static inline bool llama_kv_qnext_seq_id_in_range(const llama_kv_cache & cache, llama_seq_id seq_id) {
    const uint32_t n_slots = llama_kv_qnext_state_slots(cache);
    if (n_slots == 0) {
        return false;
    }
    if (seq_id < 0) {
        return false;
    }
    return (uint32_t) seq_id < n_slots;
}
static bool llama_kv_cache_init(
struct llama_kv_cache & cache,
const llama_context * ctx,
ggml_type type_k,
ggml_type type_v,
uint32_t kv_size,
bool offload,
ggml_type type_k_first,
ggml_type type_k_last,
ggml_type type_v_first,
ggml_type type_v_last,
int32_t n_k_first,
int32_t n_k_last,
int32_t n_v_first,
int32_t n_v_last) {
const llama_model & model = ctx->model;
const llama_cparams & cparams = ctx->cparams;
const struct llama_hparams & hparams = model.hparams;
const int64_t n_layer = model.mtp ? hparams.n_layer
: hparams.n_layer - hparams.nextn_predict_layers;
cache.has_shift = false;
cache.recurrent = llm_arch_is_recurrent(model.arch);
cache.hybrid = llm_arch_is_hybrid(model.arch);
cache.v_trans = !cache.recurrent && !cparams.flash_attn && !llm_arch_is_hybrid(model.arch);
cache.head = 0;
cache.size = kv_size;
cache.used = 0;
cache.type_k = type_k;
cache.type_v = type_v;
cache.cells.clear();
cache.cells.resize(kv_size);
if (cache.recurrent || llm_arch_is_hybrid(model.arch)) {
for (uint32_t i = 0; i < cache.size; ++i) {
cache.cells[i].src = i;
}
}
bool is_mla_attn = model.is_mla_model();
bool split_cache = false;
bool replicate_mla = false;
if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && !is_mla_attn && offload) {
cache.split_k_l.reserve(n_layer);
cache.split_v_l.reserve(n_layer);
if (llama_model_has_recurrent(&model)) {
cache.split_s_l.reserve(n_layer);
}
split_cache = true;
}
if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && is_mla_attn && offload) {
cache.replicated_k_l.reserve(n_layer);
replicate_mla = true;
}
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
if (offload) {
const bool is_mtp = (model.arch == LLM_ARCH_GLM_DSA ||
model.arch == LLM_ARCH_QWEN35 ||
model.arch == LLM_ARCH_QWEN35MOE) && hparams.nextn_predict_layers > 0;
const int64_t n_mtp_first = hparams.n_layer - hparams.nextn_predict_layers;
for (int64_t i = 0; i < n_layer; ++i) {
const bool is_mtp_tail = is_mtp && i >= n_mtp_first;
if ((split_cache || replicate_mla) && !is_mtp_tail) {
buft_layer_count[model.buft_layer[i].buft_matrix]++;
if (model.buft_layer[i].buft != model.buft_layer[i].buft_matrix) {
buft_layer_count[model.buft_layer[i].buft]++;
}
} else {
buft_layer_count[model.buft_layer[i].buft]++;
}
}
} else {
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
}
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
int n_layers = it.second;
size_t ctx_mem_size = 8u*n_layers*ggml_tensor_overhead();
if (split_cache || replicate_mla) ctx_mem_size += 4*model.splits.size()*n_layers*ggml_tensor_overhead();
struct ggml_init_params params = {
ctx_mem_size,
NULL,
true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
return false;
}
ctx_map[it.first] = ctx;
cache.ctxs.push_back(ctx);
}
if (is_mla_attn) {
int n_have_wkv_b = 0;
for (auto& l : model.layers) {
if (l.wkv_b || l.wk_b_pp) {
++n_have_wkv_b;
}
}
bool have_wkv_b = n_have_wkv_b > 0;
if (!have_wkv_b) {
if (cparams.mla_attn != 1) {
LLAMA_LOG_WARN("=========================================================\n");
LLAMA_LOG_WARN("%s: missing wkv_b tensor(s)\n", __func__);
LLAMA_LOG_WARN("%s: changing MLA from %d to 1\n", __func__, cparams.mla_attn);
if (cparams.mla_attn > 1) {
LLAMA_LOG_WARN("%s: ** Prompt processing performance will be crippled **\n", __func__);
}
LLAMA_LOG_WARN("=========================================================\n");
auto& non_cparams = const_cast<llama_cparams&>(cparams);
non_cparams.mla_attn = 1;
}
}
}
bool needs_v_cache = true;
cache.k_l.reserve(n_layer);
if (is_mla_attn && cparams.mla_attn) {
needs_v_cache = cparams.mla_attn == 1 && !cparams.flash_attn;
}
if (needs_v_cache) cache.v_l.reserve(n_layer);
cache.s_l.resize(n_layer, nullptr);
std::vector<size_t> mem_split(model.splits.size(), 0);
const uint32_t qnext_state_slots = llama_qwen3next_state_slots(cparams, kv_size);
if (llm_arch_is_hybrid(model.arch) && qnext_state_slots < std::max<uint32_t>(1, cparams.n_seq_max)) {
LLAMA_LOG_WARN("%s: reducing qwen3next state slots from %u to %u to fit KV cache size\n",
__func__, std::max<uint32_t>(1, cparams.n_seq_max), qnext_state_slots);
}
int n_mla = 0;
int n_kv_active_layers = 0;
const int64_t n_mtp_first_layer = hparams.n_layer - hparams.nextn_predict_layers;
for (int i = 0; i < (int) n_layer; i++) {
if (cparams.mtp_op_type != MTP_OP_NONE && i < (int)n_mtp_first_layer) {
cache.k_l.push_back(nullptr);
if (!is_mla_attn || !cparams.mla_attn || (cparams.mla_attn == 1 && !cparams.flash_attn)) {
cache.v_l.push_back(nullptr);
}
continue;
}
n_kv_active_layers++;
const bool qnext_recurrent = llama_is_recurrent_layer(hparams, i);
const uint32_t n_embd_v_row = llama_kv_v_row_embd(model, hparams, i);
const uint32_t n_head_kv = hparams.n_head_kv(i);
const uint32_t n_embd_head_k= hparams.n_embd_head_k(i);
const bool is_mtp_tail_layer = (model.arch == LLM_ARCH_QWEN35 ||
model.arch == LLM_ARCH_QWEN35MOE ||
model.arch == LLM_ARCH_GLM_DSA) &&
hparams.nextn_predict_layers > 0 && i >= (int)n_mtp_first_layer;
struct ggml_context * ctx = ((split_cache || replicate_mla) && !is_mtp_tail_layer) ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
ggml_tensor * k = nullptr;
ggml_tensor * v = nullptr;
ggml_tensor * s = nullptr;
if (is_mla_attn && cparams.mla_attn) {
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_type primary_kv_type = cparams.flash_attn ? cache.type_k
: (cparams.mla_attn == 1 ? cache.type_k : cache.type_v);
ggml_tensor * kv = ggml_new_tensor_2d(ctx, primary_kv_type, kv_lora_rank + n_embd_head_qk_rope, kv_size);
ggml_format_name(kv, "cache_k_l%d", i);
cache.k_l.push_back(kv);
if (!cparams.flash_attn && cparams.mla_attn == 1) {
ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_v, kv_lora_rank*kv_size);
ggml_format_name(kvt, "cache_v_l%d", i);
cache.v_l.push_back(kvt);
}
if (replicate_mla && !is_mtp_tail_layer && model.layers[i].wo && model.layers[i].wo->extra) {
auto wo = model.layers[i].wo;
auto extra_wo = (const ggml_split_tensor_t *)wo->extra;
int n_device = extra_wo->n_device;
auto & repl_k_l = cache.replicated_k_l.emplace_back();
repl_k_l.tensor_splits.resize(n_device, nullptr);
for (int is = 0; is < n_device; ++is) {
if (!extra_wo->splits[is]) continue;
ggml_tensor * rkv = ggml_new_tensor_2d(ctx, primary_kv_type,
kv_lora_rank + n_embd_head_qk_rope, kv_size);
auto split_name = std::string("cache_k_l") + std::to_string(i) + '.' + std::to_string(is);
ggml_set_name(rkv, split_name.c_str());
repl_k_l.tensor_splits[is] = rkv;
mem_split[is] += ggml_nbytes(rkv);
}
repl_k_l.ggml.n_device = n_device;
repl_k_l.ggml.split_dim = -1;
repl_k_l.ggml.splits = repl_k_l.tensor_splits.data();
kv->extra = (void *)&repl_k_l.ggml;
}
n_mla++;
}
else {
const bool is_mtp_layer = (cparams.mtp_op_type != MTP_OP_NONE && i >= (int)n_mtp_first_layer);
if (!hparams.has_kv(i) && !is_mtp_layer) {
cache.k_l.push_back(nullptr);
cache.v_l.push_back(nullptr);
continue;
}
if (qnext_recurrent) {
s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.n_embd_v_s(), qnext_state_slots);
auto s_name = std::string{"cache_s_l"} + std::to_string(i);
ggml_set_name(s, s_name.c_str());
cache.s_l[i] = s;
cache.k_l.push_back(nullptr);
cache.v_l.push_back(nullptr);
LLAMA_LOG_DEBUG("=== Created recurrent cache %s as %ld x %ld x %ld x %ld\n", s->name, s->ne[0], s->ne[1], s->ne[2], s->ne[3]);
if (split_cache && model.layers[i].ssm_out->extra) {
auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[i].ssm_out->extra;
GGML_ASSERT(split_ssm_out);
int num_v_heads = hparams.ssm_dt_rank;
int head_v_dim = hparams.ssm_d_inner / num_v_heads;
int n_device = split_ssm_out->n_device;
auto & split_s_l = cache.split_s_l.emplace_back();
split_s_l.tensor_splits.resize(n_device, nullptr);
for (int is = 0; is < n_device; ++is) {
auto split = split_ssm_out->splits[is];
if (!split) continue;
GGML_ASSERT(split->ne[0] % head_v_dim == 0);
int nv = split->ne[0] / head_v_dim;
auto size = hparams.n_embd_v_s_id(nv);
split_s_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, size, qnext_state_slots);
auto split_name = s_name + '.' + std::to_string(is);
ggml_set_name(split_s_l.tensor_splits[is], split_name.c_str());
mem_split[is] += ggml_nbytes(split_s_l.tensor_splits[is]);
}
split_s_l.ggml.n_device = n_device;
split_s_l.ggml.split_dim = 0;
split_s_l.ggml.splits = split_s_l.tensor_splits.data();
cache.s_l[i]->extra = (void *)&split_s_l.ggml;
}
continue;
}
bool split_cache_i = split_cache;
auto K = model.layers[i].wk;
auto V = model.layers[i].wv;
if (!V && model.arch == LLM_ARCH_GEMMA4) {
V = K;
}
if (split_cache && (!K || !V || !K->extra || !V->extra)) {
ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
split_cache_i = false;
}
int n_embd_head_v = hparams.n_embd_head_v(i);
auto this_type_k = type_k;
if (type_k_first != type_k && n_k_first > 0 && i < n_k_first) {
this_type_k = type_k_first;
}
if (type_k_last != type_k && n_k_last > 0 && i >= n_layer - n_k_last) {
this_type_k = type_k_last;
}
if (this_type_k != type_k) {
LLAMA_LOG_INFO("================= Setting K-cache type in layer %2d to %s\n", i, ggml_type_name(this_type_k));
}
k = ggml_new_tensor_2d(ctx, this_type_k, n_embd_head_k, n_head_kv*kv_size);
int64_t v_ne = int64_t(n_embd_v_row)*kv_size;
auto this_type_v = type_v;
if (type_v_first != type_v && n_v_first > 0 && i < n_v_first) {
this_type_v = type_v_first;
}
if (type_v_last != type_v && n_v_last > 0 && i >= n_layer - n_v_last) {
this_type_v = type_v_last;
}
if (this_type_v != type_v) {
LLAMA_LOG_INFO("================= Setting V-cache type in layer %2d to %s\n", i, ggml_type_name(this_type_v));
}
v = ggml_new_tensor_1d(ctx, this_type_v, v_ne);
auto k_name = std::string{"cache_k_l"} + std::to_string(i);
auto v_name = std::string{"cache_v_l"} + std::to_string(i);
ggml_set_name(k, k_name.c_str());
ggml_set_name(v, v_name.c_str());
if (split_cache_i) {
bool use_V_for_K = model.layers[i].attn_k_norm && model.layers[i].attn_k_norm->ne[0] == K->ne[1] ? true : false;
auto extra_K = (const ggml_split_tensor_t *)K->extra;
auto extra_V = (const ggml_split_tensor_t *)V->extra;
auto & split_k_l = cache.split_k_l.emplace_back();
auto & split_v_l = cache.split_v_l.emplace_back();
split_k_l.tensor_splits.resize(extra_K->n_device, nullptr);
split_v_l.tensor_splits.resize(extra_V->n_device, nullptr);
for (int is = 0; is < extra_K->n_device; ++is) {
auto split = use_V_for_K ? extra_V->splits[is] : extra_K->splits[is];
if (!split) continue;
int nhead_kv = use_V_for_K ? split->ne[1] / n_embd_head_v : split->ne[1]/n_embd_head_k;
if (use_V_for_K) {
LLAMA_LOG_DEBUG("K_cache(%d, %d): using %d instead of %ld heads\n",
i, is, nhead_kv, extra_K->splits[is]->ne[1]/n_embd_head_k);
}
split_k_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, this_type_k, n_embd_head_k, nhead_kv * kv_size);
auto split_name = k_name + '.' + std::to_string(is);
ggml_set_name(split_k_l.tensor_splits[is], split_name.c_str());
mem_split[is] += ggml_nbytes(split_k_l.tensor_splits[is]);
}
split_k_l.ggml.n_device = extra_K->n_device;
split_k_l.ggml.split_dim = 0;
split_k_l.ggml.splits = split_k_l.tensor_splits.data();
for (int is = 0; is < extra_V->n_device; ++is) {
auto split = extra_V->splits[is];
if (!split) continue;
split_v_l.tensor_splits[is] = ggml_new_tensor_1d(ctx, this_type_v, split->ne[1] * kv_size);
auto split_name = v_name + '.' + std::to_string(is);
ggml_set_name(split_v_l.tensor_splits[is], split_name.c_str());
mem_split[is] += ggml_nbytes(split_v_l.tensor_splits[is]);
}
split_v_l.ggml.n_device = extra_V->n_device;
split_v_l.ggml.split_dim = 0;
split_v_l.ggml.splits = split_v_l.tensor_splits.data();
k->extra = (void *)&split_k_l.ggml;
v->extra = (void *)&split_v_l.ggml;
}
cache.k_l.push_back(k);
cache.v_l.push_back(v);
}
}
if (is_mla_attn && cparams.mla_attn && n_mla < n_kv_active_layers && n_mla > 0) {
LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d active KV layers having MLA enabled\n", __func__, n_mla, n_kv_active_layers);
LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
GGML_ABORT("fatal error");
}
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
int ntensor = 0;
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
++ntensor;
}
if (ntensor > 0) {
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
cache.bufs.push_back(buf);
}
}
if (split_cache || replicate_mla) {
LLAMA_LOG_INFO("%s: KV cache size per device%s:\n", __func__,
replicate_mla ? " (MLA replicated)" : "");
for (int i = 0; i < int(mem_split.size()); ++i) LLAMA_LOG_INFO(" Device %d: %g MiB\n", i, mem_split[i]/1024./1024.);
}
#if 0#endif
return true;
}
static bool llama_kv_cache_find_slot(
struct llama_kv_cache & cache,
const struct llama_batch & batch) {
const uint32_t n_tokens = batch.n_tokens;
if (cache.recurrent) {
llama_seq_id min = cache.size - 1;
llama_seq_id max = 0;
for (uint32_t i = 0; i < n_tokens; ++i) {
for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) {
llama_seq_id seq_id = batch.seq_id[i][j];
if ((uint32_t) seq_id < cache.size) {
if (seq_id > max) {
max = seq_id;
}
if (seq_id < min) {
min = seq_id;
}
if (batch.pos[i] != cache.cells[seq_id].pos + 1) {
LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n",
__func__, batch.pos[i], cache.cells[seq_id].pos, seq_id);
}
if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) {
cache.used += 1;
}
cache.cells[seq_id].pos = batch.pos[i];
} else {
LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size);
return false;
}
}
}
cache.head = min;
cache.n = max - min + 1;
return max >= min;
}
if (n_tokens > cache.size) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
return false;
}
uint32_t n_tested = 0;
while (true) {
if (cache.head + n_tokens > cache.size) {
n_tested += cache.size - cache.head;
cache.head = 0;
continue;
}
bool found = true;
for (uint32_t i = 0; i < n_tokens; i++) {
if (cache.cells[cache.head + i].pos >= 0) {
found = false;
cache.head += i + 1;
n_tested += i + 1;
break;
}
}
if (found) {
break;
}
if (n_tested >= cache.size) {
return false;
}
}
for (uint32_t i = 0; i < n_tokens; i++) {
cache.cells[cache.head + i].pos = batch.pos[i];
for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
}
}
cache.used += n_tokens;
return true;
}
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
for (uint32_t i = cache.size; i > 0; --i) {
const llama_kv_cell & cell = cache.cells[i - 1];
if (cell.pos >= 0 && !cell.is_empty()) {
return i;
}
}
return 0;
}
// Strided variant of llama_kv_cache_cell_max().
//
// Probes every `pad`-th cell from the end of the cache towards the start. When
// a probed cell is in use, the cells between it and the previous probe are
// scanned to find the highest used cell, and one past its index is returned.
// Returns 0 when no probed cell is in use; a used cell lying strictly between
// two unused probes is therefore not detected.
//
// The step is clamped so that the unsigned index can never wrap around when
// cache.size is not a multiple of `pad`, and a `pad` of 0 is treated as 1
// (exact scan) instead of looping forever.
static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache, uint32_t pad) {
    const uint32_t stride = pad > 0 ? pad : 1;
    uint32_t last = cache.size;
    for (uint32_t i = cache.size; i > 0; i = i > stride ? i - stride : 0) {
        const llama_kv_cell & cell = cache.cells[i - 1];
        if (cell.pos >= 0 && !cell.is_empty()) {
            // Refine: look for a used cell above the probe, below the previous probe.
            for (uint32_t j = last; j > i; --j) {
                const auto & cell_j = cache.cells[j - 1];
                if (cell_j.pos >= 0 && !cell_j.is_empty()) {
                    return j;
                }
            }
            return i;
        }
        last = i;
    }
    return 0;
}
// Checkpointing is available when at least one entry of s_l is non-null.
bool llama_kv_cache::checkpoint_supported() const {
    return std::any_of(s_l.begin(), s_l.end(),
            [](const auto * state) { return state != nullptr; });
}
// Allocates "shadow" tensors that mirror the per-layer state tensors in s_l so
// that their contents can be copied aside and restored later.
//
// conv_only_shadow: when true, non-split layers get a 1-D F32 shadow of
//   ckpt.per_step_conv_state_dim elements instead of a full duplicate.
//   Split (per-device) tensors are always duplicated in full.
//
// Returns true on success or when shadows of the requested kind already exist;
// returns false when shadows of the other kind are already allocated, or when
// creating a ggml context / backend buffer fails.
bool llama_kv_cache::checkpoint_alloc_shadows(bool conv_only_shadow) {
if (ckpt.allocated) {
// Already allocated: only succeed if the existing shadows are of the requested kind.
if (ckpt.shadow_conv_only != conv_only_shadow) {
LLAMA_LOG_ERROR("%s: requested %s shadow buffers, but %s shadow buffers are already allocated\n",
__func__,
conv_only_shadow ? "conv-state-only" : "full-state",
ckpt.shadow_conv_only ? "conv-state-only" : "full-state");
return false;
}
return true;
}
const uint32_t n_layer = (uint32_t)s_l.size();
ckpt.s_l_shadow.resize(n_layer, nullptr);
// One entry per tensor to shadow. `primary` initially points at the source
// tensor and is overwritten with the shadow tensor once that is created.
// split_idx is the device index for split tensors and -1 otherwise.
struct tensor_entry {
ggml_tensor * primary;
uint32_t il;
int split_idx; };
// Group the tensors by the buffer type they currently live in, so that each
// shadow is allocated from the same buffer type as its source.
std::map<ggml_backend_buffer_type_t, std::vector<tensor_entry>> buft_entries;
for (uint32_t il = 0; il < n_layer; ++il) {
if (s_l[il] == nullptr) {
continue;
}
auto * extra = s_l[il]->extra;
if (extra != nullptr) {
// Split tensor: one entry per device that actually holds a split.
auto * split_info = (const ggml_split_tensor_t *)extra;
for (int d = 0; d < split_info->n_device; ++d) {
if (split_info->splits[d] == nullptr) continue;
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(split_info->splits[d]->buffer);
buft_entries[buft].push_back({split_info->splits[d], il, d});
}
} else {
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(s_l[il]->buffer);
buft_entries[buft].push_back({s_l[il], il, -1});
}
}
for (auto & [buft, entries] : buft_entries) {
// Metadata-only context (third field is no_alloc); tensor data is allocated
// below from the backend buffer type.
ggml_init_params params = {
entries.size() * ggml_tensor_overhead(),
NULL,
true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to create ggml context for shadow tensors\n", __func__);
// NOTE(review): contexts/buffers created in earlier iterations stay registered in
// ckpt.shadow_ctxs/shadow_bufs while ckpt.allocated remains false.
return false;
}
for (auto & entry : entries) {
ggml_tensor * shadow = nullptr;
if (conv_only_shadow && entry.split_idx < 0) {
shadow = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ckpt.per_step_conv_state_dim);
} else {
shadow = ggml_dup_tensor(ctx, entry.primary);
}
if (entry.split_idx >= 0) {
ggml_format_name(shadow, "shadow_s_l%d_d%d", entry.il, entry.split_idx);
} else {
ggml_format_name(shadow, "shadow_s_l%d", entry.il);
}
// From here on the entry refers to the shadow, not to the source tensor.
entry.primary = shadow;
}
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for shadow tensors\n", __func__);
ggml_free(ctx);
return false;
}
ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s shadow buffer = %8.2f MiB%s\n", __func__,
ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0,
conv_only_shadow ? " (conv-state only)" : "");
ckpt.shadow_ctxs.push_back(ctx);
ckpt.shadow_bufs.push_back(buf);
// Publish the shadows of non-split layers; split shadows are wired up below.
for (const auto & entry : entries) {
if (entry.split_idx < 0) {
ckpt.s_l_shadow[entry.il] = entry.primary;
}
}
}
// Build the per-layer, per-device table of split shadows by looking each
// (layer, device) pair up in the entries of its buffer type.
ckpt.split_s_l_shadow.resize(n_layer);
for (uint32_t il = 0; il < n_layer; ++il) {
if (s_l[il] == nullptr || s_l[il]->extra == nullptr) {
continue;
}
auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra;
auto & shadow_split = ckpt.split_s_l_shadow[il];
shadow_split.resize(split_info->n_device, nullptr);
for (int d = 0; d < split_info->n_device; ++d) {
if (split_info->splits[d] == nullptr) continue;
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(split_info->splits[d]->buffer);
for (auto & entry : buft_entries[buft]) {
if (entry.il == il && entry.split_idx == d) {
shadow_split[d] = entry.primary;
break;
}
}
}
}
ckpt.shadow_conv_only = conv_only_shadow;
ckpt.allocated = true;
return true;
}
// Saves a checkpoint of the cache: snapshots the cell metadata (cells, head,
// used) and copies every state tensor in s_l (or each of its per-device
// splits) into its full-state shadow tensor.
//
// sched: scheduler used to look up the backend each tensor is assigned to.
//
// Returns false when the full-state shadow tensors cannot be allocated.
// All copies are issued asynchronously and the involved backends are
// synchronized before returning.
bool llama_kv_cache::checkpoint_save(ggml_backend_sched_t sched) {
    if (!checkpoint_alloc_shadows(false)) {
        return false;
    }
    GGML_ASSERT(!ckpt.shadow_conv_only);

    const uint32_t n_layer = (uint32_t)s_l.size();

    ckpt.cells_snapshot = cells;
    ckpt.head_snapshot  = head;
    ckpt.used_snapshot  = used;

    std::unordered_set<ggml_backend_t> backends_to_sync;
    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr) {
            continue;
        }
        if (s_l[il]->extra != nullptr) {
            auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra;
            auto & shadow_split = ckpt.split_s_l_shadow[il];
            for (int d = 0; d < split_info->n_device; ++d) {
                if (split_info->splits[d] && shadow_split[d]) {
                    auto src_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]);
                    // Same guard as the non-split path: fail loudly instead of
                    // passing a null backend to the copy.
                    GGML_ASSERT(src_backend != nullptr);
                    ggml_backend_tensor_copy_async(src_backend, src_backend, split_info->splits[d], shadow_split[d]);
                    backends_to_sync.insert(src_backend);
                }
            }
        } else {
            GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr);
            auto src_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]);
            GGML_ASSERT(src_backend != nullptr);
            ggml_backend_tensor_copy_async(src_backend, src_backend, s_l[il], ckpt.s_l_shadow[il]);
            backends_to_sync.insert(src_backend);
        }
    }

    // Wait for all asynchronous copies before declaring the checkpoint valid.
    for (auto backend : backends_to_sync) {
        ggml_backend_synchronize(backend);
    }
    ckpt.saved = true;
    return true;
}
// Restores the checkpoint taken by checkpoint_save(): puts back the cell
// metadata snapshot (cells, head, used) and copies each shadow tensor back
// into its state tensor in s_l (or its per-device split).
//
// sched: scheduler used to look up the backend each tensor is assigned to.
//
// Returns false when no checkpoint has been saved. All copies are issued
// asynchronously and the involved backends are synchronized before returning.
bool llama_kv_cache::checkpoint_restore(ggml_backend_sched_t sched) {
    if (!ckpt.saved) {
        LLAMA_LOG_ERROR("%s: no checkpoint saved\n", __func__);
        return false;
    }
    GGML_ASSERT(!ckpt.shadow_conv_only);

    const uint32_t n_layer = (uint32_t)s_l.size();

    cells = ckpt.cells_snapshot;
    head  = ckpt.head_snapshot;
    used  = ckpt.used_snapshot;

    std::unordered_set<ggml_backend_t> backends_to_sync;
    for (uint32_t il = 0; il < n_layer; ++il) {
        if (s_l[il] == nullptr) {
            continue;
        }
        if (s_l[il]->extra != nullptr) {
            auto * split_info = (const ggml_split_tensor_t *)s_l[il]->extra;
            auto & shadow_split = ckpt.split_s_l_shadow[il];
            for (int d = 0; d < split_info->n_device; ++d) {
                if (split_info->splits[d] && shadow_split[d]) {
                    auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, split_info->splits[d]);
                    // Same guard as the non-split path: fail loudly instead of
                    // passing a null backend to the copy.
                    GGML_ASSERT(dst_backend != nullptr);
                    ggml_backend_tensor_copy_async(dst_backend, dst_backend, shadow_split[d], split_info->splits[d]);
                    backends_to_sync.insert(dst_backend);
                }
            }
        } else {
            GGML_ASSERT(ckpt.s_l_shadow[il] != nullptr);
            // The shadow must be a full-size copy of the state tensor.
            GGML_ASSERT(ggml_nbytes(ckpt.s_l_shadow[il]) == ggml_nbytes(s_l[il]));
            auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, s_l[il]);
            GGML_ASSERT(dst_backend != nullptr);
            ggml_backend_tensor_copy_async(dst_backend, dst_backend, ckpt.s_l_shadow[il], s_l[il]);
            backends_to_sync.insert(dst_backend);
        }
    }

    // Wait for all asynchronous copies to land before returning.
    for (auto backend : backends_to_sync) {
        ggml_backend_synchronize(backend);
    }
    return true;
}
// Invalidates the saved checkpoint by clearing the `saved` flag. The shadow
// tensors and snapshots themselves are left untouched by this function.
void llama_kv_cache::checkpoint_delete() {
ckpt.saved = false;
}
bool llama_kv_cache::per_step_alloc(const llama_model & model, int max_tokens) {
if (ckpt.per_step_max_allocated >= max_tokens) {
return true;
}
if (!ckpt.per_step_ssm.empty()) {
for (struct ggml_context * ctx : ckpt.per_step_ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : ckpt.per_step_bufs) {
ggml_backend_buffer_free(buf);
}
ckpt.per_step_ctxs.clear();
ckpt.per_step_bufs.clear();
ckpt.per_step_ssm.clear();
ckpt.per_step_conv.clear();
ckpt.per_step_max_allocated = 0;
}
const uint32_t n_layer = (uint32_t)s_l.size();
ckpt.per_step_ssm.resize(n_layer);
ckpt.per_step_conv.resize(n_layer);
const int64_t ssm_state_dim = ckpt.per_step_ssm_state_size;
const int64_t conv_dim = ckpt.per_step_conv_dim;
const int64_t conv_state_dim = ckpt.per_step_conv_state_dim;
if (ssm_state_dim <= 0 || conv_dim <= 0) {
LLAMA_LOG_ERROR("%s: per_step dimensions not set (ssm=%lld, conv_dim=%lld)\n",
__func__, (long long)ssm_state_dim, (long long)conv_dim);
return false;
}
int num_v_heads = model.hparams.ssm_dt_rank;
int head_v_dim = model.hparams.ssm_d_inner / num_v_heads;
int d_conv = model.hparams.ssm_d_conv;
std::map<ggml_backend_buffer_type_t, std::vector<std::pair<std::pair<uint32_t, int32_t>, ggml_backend_buffer_type_t>>> buft_layers;
for (uint32_t il = 0; il < n_layer; ++il) {
if (s_l[il] == nullptr) continue;
if (s_l[il]->extra) {
auto split_sl = (ggml_split_tensor_t *)s_l[il]->extra;
for (int id = 0; id < split_sl->n_device; ++id) {
if (!split_sl->splits[id]) continue;
auto buft = ggml_backend_buffer_get_type(split_sl->splits[id]->buffer);
GGML_ASSERT(buft != nullptr);
buft_layers[buft].push_back({{il, id}, buft});
}
} else {
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(s_l[il]->buffer);
buft_layers[buft].push_back({{il, -1}, buft});
}
}
for (auto & [buft, layers] : buft_layers) {
ggml_init_params params = {
layers.size() * 2 * ggml_tensor_overhead(),
NULL,
true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to create ggml context for per-step checkpoints\n", __func__);
return false;
}
for (auto & [p, bt] : layers) {
auto [il, id] = p;
if (id < 0) {
if (max_tokens > 1) {
GGML_ASSERT(ckpt.per_step_ssm[il].empty());
GGML_ASSERT(ckpt.per_step_conv[il].empty());
ggml_tensor * t_ssm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)(max_tokens - 1) * ssm_state_dim);
ggml_format_name(t_ssm, "per_step_ssm_l%d", il);
ckpt.per_step_ssm[il].push_back(t_ssm);
}
ggml_tensor * t_qkv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)max_tokens * conv_state_dim);
ggml_format_name(t_qkv, "per_step_qkv_l%d", il);
ckpt.per_step_conv[il].push_back(t_qkv);
} else {
auto split_sl = (ggml_split_tensor_t *)s_l[il]->extra;
auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[il].ssm_out->extra;
GGML_ASSERT(split_ssm_out && split_ssm_out->splits[id]);
auto split = split_ssm_out->splits[id];
GGML_ASSERT(split->ne[0] % head_v_dim == 0);
if (ckpt.per_step_ssm[il].empty()) {
ckpt.per_step_ssm [il].resize(split_sl->n_device, nullptr);
ckpt.per_step_conv[il].resize(split_sl->n_device, nullptr);
} else {
GGML_ASSERT(int(ckpt.per_step_ssm[il].size()) == split_sl->n_device);
GGML_ASSERT(int(ckpt.per_step_conv[il].size()) == split_sl->n_device);
GGML_ASSERT(ckpt.per_step_ssm[il][id] == nullptr);
GGML_ASSERT(ckpt.per_step_conv[il][id] == nullptr);
}
int nv = split->ne[0] / head_v_dim; auto [this_conv_dim, this_ssm_dim] = model.hparams.n_embd_v_s_dims(nv);
if (max_tokens > 1) {
auto t_ssm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)(max_tokens - 1) * this_ssm_dim);
ggml_format_name(t_ssm, "per_step_ssm_l%d_%d", il, id);
ckpt.per_step_ssm[il][id] = t_ssm;
}
auto t_qkv = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (int64_t)max_tokens * this_conv_dim * (d_conv - 1));
ggml_format_name(t_qkv, "per_step_qkv_l%d_%d", il, id);
ckpt.per_step_conv[il][id] = t_qkv;
}
}
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for per-step checkpoints\n", __func__);
ggml_free(ctx);
return false;
}
ggml_backend_buffer_clear(buf, 0);
LLAMA_LOG_INFO("%s: %10s per-step buffer = %8.2f MiB (max_tokens=%d)\n", __func__,
ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0, max_tokens);
ckpt.per_step_ctxs.push_back(ctx);
ckpt.per_step_bufs.push_back(buf);
}
ckpt.per_step_max_allocated = max_tokens;
return true;
}
// Copies the per-step snapshot number `step` back into the state tensor `s_l`.
//
// The copy is done with two hand-built tensor descriptors (stack copies of
// *s_l with adjusted ne/nb/data) rather than real ggml views:
//   1. the SSM part: ssm_bytes per row, written conv_bytes into each row of s_l,
//      read from per_step_ssm at offset step * ssm_bytes;
//   2. the conv part: conv_bytes per row, written at the start of each row of
//      s_l, read from per_step_conv at offset step * conv_bytes.
// Both copies are queued asynchronously on the backend that `sched` reports
// for s_l; that backend is added to backends_to_sync so the caller can
// synchronize it afterwards.
//
// NOTE(review): no bounds check is performed here; the caller must make sure
// `step` is within the range the per-step tensors were allocated for.
static void restore_recurrent_cache_tensors(int step, ggml_backend_sched_t sched,
size_t ssm_bytes, size_t conv_bytes,
ggml_tensor * s_l, ggml_tensor * per_step_ssm, ggml_tensor * per_step_conv,
std::unordered_set<ggml_backend_t> & backends_to_sync) {
auto dst_backend = ggml_backend_sched_get_tensor_backend(sched, s_l);
// Destination descriptor for the SSM part of each row of s_l.
auto dst = *s_l;
dst.ne[0] = ssm_bytes/sizeof(float);
// Row stride is one full state row (conv part followed by SSM part).
dst.nb[1] = dst.nb[2] = dst.nb[3] = ssm_bytes + conv_bytes;
dst.data = (char *)s_l->data + conv_bytes;
// Source descriptor: same shape and strides as dst, data redirected to the snapshot.
// NOTE(review): src inherits s_l's `buffer` field although its data lies in the
// per-step buffer — confirm the backend copy only relies on the buffer type.
auto src = dst;
src.data = (char *)per_step_ssm->data + (size_t)step * ssm_bytes;
ggml_backend_tensor_copy_async(dst_backend, dst_backend, &src, &dst);
backends_to_sync.insert(dst_backend);
// Re-target the same descriptors at the conv part, which starts each row.
dst.data = (char *)s_l->data;
dst.ne[0] = conv_bytes/sizeof(float);
src = dst;
src.data = (char *)per_step_conv->data + (size_t)step * conv_bytes;
ggml_backend_tensor_copy_async(dst_backend, dst_backend, &src, &dst);
}
bool llama_kv_cache::per_step_restore(const llama_model & model, ggml_backend_sched_t sched, int step) {
if (ckpt.per_step_ssm.empty() || step < 0) {
return false;
}
std::unordered_set<ggml_backend_t> backends_to_sync;
const int64_t ssm_state_dim = ckpt.per_step_ssm_state_size;
const int64_t conv_state_dim = ckpt.per_step_conv_state_dim;
const int64_t conv_dim = ckpt.per_step_conv_dim;
const int32_t d_conv = ckpt.per_step_d_conv;
if (ssm_state_dim <= 0 || conv_dim <= 0 || d_conv <= 1) return false;
const int64_t ssm_bytes = ssm_state_dim * sizeof(float);
const int64_t conv_bytes = conv_state_dim * sizeof(float);
int num_v_heads = model.hparams.ssm_dt_rank;
int head_v_dim = model.hparams.ssm_d_inner / num_v_heads;
const uint32_t n_layer = (uint32_t)s_l.size();
for (uint32_t il = 0; il < n_layer; ++il) {
if (s_l[il] == nullptr || ckpt.per_step_ssm[il].empty()) continue;
if (!s_l[il]->extra) {
restore_recurrent_cache_tensors(step, sched, ssm_bytes, conv_bytes,
s_l[il], ckpt.per_step_ssm[il].front(), ckpt.per_step_conv[il].front(),
backends_to_sync);
} else {
auto split_sl = (const ggml_split_tensor_t *)s_l[il]->extra;
for (int id = 0; id < split_sl->n_device; ++id) {
if (!split_sl->splits[id]) continue;
auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[il].ssm_out->extra;
GGML_ASSERT(split_ssm_out && split_ssm_out->splits[id]);
auto split = split_ssm_out->splits[id];
GGML_ASSERT(split->ne[0] % head_v_dim == 0);
int nv = split->ne[0] / head_v_dim; auto [this_conv_dim, this_ssm_dim] = model.hparams.n_embd_v_s_dims(nv);
auto this_conv_bytes = (d_conv - 1) * this_conv_dim * sizeof(float);
restore_recurrent_cache_tensors(step, sched, this_ssm_dim * sizeof(float), this_conv_bytes,
split_sl->splits[id], ckpt.per_step_ssm[il][id], ckpt.per_step_conv[il][id],
backends_to_sync);
}
}
}
for (auto backend : backends_to_sync) {
ggml_backend_synchronize(backend);
}
return true;
}
// Resets the cache to its empty state: every cell loses its position and
// sequence ids and points at itself as source, the head and usage counters
// are zeroed, and all backing buffers are cleared to 0.
static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
    const int32_t n_cells = (int32_t) cache.size;
    for (int32_t idx = 0; idx < n_cells; ++idx) {
        auto & cell = cache.cells[idx];
        cell.pos = -1;
        cell.src = idx;
        cell.seq_id.clear();
    }

    cache.head = 0;
    cache.used = 0;

    for (const auto & buffer : cache.bufs) {
        ggml_backend_buffer_clear(buffer, 0);
    }
}
static bool llama_kv_cache_seq_rm(
struct llama_kv_cache & cache,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1) {
uint32_t new_head = cache.size;
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
if (cache.recurrent) {
if (seq_id >= (int64_t) cache.size) {
return false;
}
if (0 <= seq_id) {
if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) {
return false;
}
} else {
if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits<llama_pos>::max())) {
return false;
}
}
}
const bool has_qnext_state = llama_kv_has_qnext_state_storage(cache);
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
if (seq_id < 0) {
cache.cells[i].seq_id.clear();
} else if (cache.cells[i].has_seq_id(seq_id)) {
cache.cells[i].seq_id.erase(seq_id);
} else {
continue;
}
if (cache.cells[i].is_empty()) {
if (cache.cells[i].pos >= 0) cache.used--;
cache.cells[i].pos = -1;
if (has_qnext_state) {
cache.cells[i].src = i;
}
if (new_head == cache.size) new_head = i;
}
}
}
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
return true;
}
// Makes sequence `seq_id_dst` share the cells of `seq_id_src` whose position
// lies in [p0, p1). Negative p0/p1 mean "from the start" / "to the end".
//
// Recurrent caches: cells are indexed by sequence id, so only the source link,
// sequence-id membership and position of the destination cell are updated and
// cache.do_copy is raised; the position range is not used.
// Other caches: optionally updates the qnext state links, resets cache.head
// to 0 and tags every matching cell with seq_id_dst.
static void llama_kv_cache_seq_cp(
        struct llama_kv_cache & cache,
        llama_seq_id seq_id_src,
        llama_seq_id seq_id_dst,
        llama_pos p0,
        llama_pos p1) {
    if (p0 < 0) p0 = 0;
    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();

    const bool dst_in_cache = (uint32_t) seq_id_dst < cache.size;
    const bool src_in_cache = (uint32_t) seq_id_src < cache.size;

    if (cache.recurrent) {
        if (dst_in_cache && src_in_cache) {
            // Follow the source link so the destination refers to the original state.
            seq_id_src = cache.cells[seq_id_src].src;
            GGML_ASSERT((uint32_t) seq_id_src < cache.size);

            auto & dst_cell = cache.cells[seq_id_dst];
            dst_cell.src = seq_id_src;

            // The destination is "present" exactly when the source is.
            if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) {
                dst_cell.seq_id.insert(seq_id_dst);
            } else {
                dst_cell.seq_id.erase(seq_id_dst);
            }

            cache.do_copy = true;
            dst_cell.pos = cache.cells[seq_id_src].pos;
        }
        return;
    }

    const bool has_qnext_state = llama_kv_has_qnext_state_storage(cache);
    if (has_qnext_state &&
            llama_kv_qnext_seq_id_in_range(cache, seq_id_dst) &&
            llama_kv_qnext_seq_id_in_range(cache, seq_id_src) &&
            dst_in_cache &&
            src_in_cache) {
        // NOTE(review): seq_id_src is replaced by the resolved source link here,
        // and the tagging loop below then matches on that resolved id — confirm
        // this is intended when the link differs from the requested source.
        seq_id_src = cache.cells[seq_id_src].src;
        GGML_ASSERT((uint32_t) seq_id_src < cache.size);

        auto & dst_cell = cache.cells[seq_id_dst];
        dst_cell.src = seq_id_src;
        dst_cell.pos = cache.cells[seq_id_src].pos;
        cache.do_copy = true;
    }

    cache.head = 0;

    for (uint32_t i = 0; i < cache.size; ++i) {
        auto & cell = cache.cells[i];
        const bool in_range = cell.pos >= p0 && cell.pos < p1;
        if (cell.has_seq_id(seq_id_src) && in_range) {
            cell.seq_id.insert(seq_id_dst);
        }
    }
}
// Keeps only sequence `seq_id`: cells that contain it end up tagged with that
// sequence alone, every other cell is freed. cache.head is moved back to the
// first freed cell if that lies before the current head.
static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    const bool has_qnext_state = llama_kv_has_qnext_state_storage(cache);

    uint32_t first_freed = cache.size;
    for (uint32_t i = 0; i < cache.size; ++i) {
        auto & cell = cache.cells[i];
        const bool keep = cell.has_seq_id(seq_id);

        // Either way the previous set of sequence ids is dropped.
        cell.seq_id.clear();

        if (keep) {
            cell.seq_id.insert(seq_id);
            continue;
        }

        if (cell.pos >= 0) {
            cache.used--;
        }
        cell.pos = -1;
        if (has_qnext_state) {
            cell.src = i;
        }
        if (first_freed == cache.size) {
            first_freed = i;
        }
    }

    if (first_freed != cache.size && first_freed < cache.head) {
        cache.head = first_freed;
    }
}
static void llama_kv_cache_seq_add(
struct llama_kv_cache & cache,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
llama_pos delta) {
uint32_t new_head = cache.size;
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
if (p0 == p1) return;
if (cache.recurrent) {
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
llama_kv_cell & cell = cache.cells[seq_id];
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
cell.pos += delta;
}
}
return;
}
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
cache.has_shift = true;
cache.cells[i].pos += delta;
cache.cells[i].delta += delta;
if (cache.cells[i].pos < 0) {
if (!cache.cells[i].is_empty()) {
cache.used--;
}
cache.cells[i].pos = -1;
cache.cells[i].seq_id.clear();
if (new_head == cache.size) {
new_head = i;
}
}
}
}
cache.head = new_head != cache.size ? new_head : 0;
}
static void llama_kv_cache_seq_div(
struct llama_kv_cache & cache,
llama_seq_id seq_id,
llama_pos p0,
llama_pos p1,
int d) {
if (p0 < 0) p0 = 0;
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
if (p0 == p1) return;
if (cache.recurrent) {
if (0 <= seq_id && seq_id < (int64_t) cache.size) {
llama_kv_cell & cell = cache.cells[seq_id];
if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) {
cell.pos /= d;
}
}
return;
}
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
cache.has_shift = true;
{
llama_pos p_old = cache.cells[i].pos;
cache.cells[i].pos /= d;
cache.cells[i].delta += cache.cells[i].pos - p_old;
}
}
}
}
// Returns the largest position stored for sequence `seq_id`, or 0 when no
// cell belongs to that sequence (the result never drops below 0).
static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    llama_pos highest = 0;
    for (uint32_t i = 0; i < cache.size; ++i) {
        const auto & cell = cache.cells[i];
        if (!cell.has_seq_id(seq_id)) {
            continue;
        }
        if (cell.pos > highest) {
            highest = cell.pos;
        }
    }
    return highest;
}
// Returns the smallest position stored for sequence `seq_id`, or -1 when no
// cell belongs to that sequence.
//
// All cells are examined: cells are not guaranteed to be ordered by position
// (slots are reused after removals), so the first matching cell is not
// necessarily the one with the lowest position.
static llama_pos llama_kv_cache_seq_pos_min(struct llama_kv_cache & cache, llama_seq_id seq_id) {
    bool found = false;
    llama_pos result = std::numeric_limits<llama_pos>::max();
    for (uint32_t i = 0; i < cache.size; ++i) {
        const auto & cell = cache.cells[i];
        if (cell.has_seq_id(seq_id)) {
            found  = true;
            result = std::min(result, cell.pos);
        }
    }
    return found ? result : -1;
}
// Requests a defragmentation by raising cache.do_defrag; no cells are moved
// by this function itself.
static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
cache.do_defrag = true;
}
// Padding granularity for the KV cache: 256 when flash attention is enabled,
// 32 otherwise.
static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
    if (cparams.flash_attn) {
        return 256u;
    }
    return 32u;
}
// Stores the architecture reported by the loader in model.arch.
// Throws std::runtime_error when the architecture is LLM_ARCH_UNKNOWN.
void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();
    if (model.arch != LLM_ARCH_UNKNOWN) {
        return;
    }
    throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
}
// Logs a human-readable summary of the loaded model: file format and
// architecture, hyper-parameters (skipped for vocab-only loads), model type,
// parameter count and size, architecture-specific extras, and finally the
// vocabulary info via vocab.print_info().
static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
const char * rope_scaling_type = hparams.rope_scaling_type_name(hparams.rope_scaling_type_train);
// Formats a per-layer quantity f(0..n-1): a single value when all layers
// agree, otherwise a bracketed comma-separated list.
// NOTE(review): v[0] is read unconditionally in the uniform case, so n == 0
// would index an empty vector.
auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
bool is_var = false;
std::vector<uint32_t> v;
for (uint32_t i = 0; i < n; ++i) {
v.push_back(f(i));
if (v[i] != v[0]) {
is_var = true;
}
}
std::stringstream ss;
if (is_var) {
ss << "[";
for (uint32_t i = 0; i < n; ++i) {
ss << v[i];
if (i < n - 1) {
ss << ", ";
}
}
ss << "]";
} else {
ss << v[0];
}
return ss.str();
};
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llama_model_arch_name(model.arch));
// Hyper-parameters are only reported when more than the vocabulary was loaded.
if (!hparams.vocab_only) {
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
LLAMA_LOG_INFO("%s: n_swa_pattern = %u\n", __func__, hparams.n_swa_pattern);
LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k(0));
LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v(0));
LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
// The mrope sections are printed only when at least one of them is non-zero.
if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
}
LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
}
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
// Parameter count, scaled to the largest unit (T/B/M/K) whose threshold it reaches.
if (ml.n_elements >= 1e12) {
LLAMA_LOG_INFO("%s: model params = %.3f T\n", __func__, ml.n_elements*1e-12);
} else if (ml.n_elements >= 1e9) {
LLAMA_LOG_INFO("%s: model params = %.3f B\n", __func__, ml.n_elements*1e-9);
} else if (ml.n_elements >= 1e6) {
LLAMA_LOG_INFO("%s: model params = %.3f M\n", __func__, ml.n_elements*1e-6);
} else {
LLAMA_LOG_INFO("%s: model params = %.3f K\n", __func__, ml.n_elements*1e-3);
}
// Model size in MiB or GiB, together with the average bits per weight (BPW).
if (ml.n_bytes < GiB) {
LLAMA_LOG_INFO("%s: model size = %.3f MiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
} else {
LLAMA_LOG_INFO("%s: model size = %.3f GiB (%.3f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
}
{
// Same statistics with the "token_embd.weight" and "output.weight" tensors
// subtracted; only reported when both tensors are present.
auto n_bytes = ml.n_bytes;
auto n_elements = ml.n_elements;
auto meta_tke = ml.get_tensor_meta("token_embd.weight");
auto meta_out = ml.get_tensor_meta("output.weight");
if (meta_tke && meta_out) {
n_bytes -= ggml_nbytes(meta_tke);
n_elements -= ggml_nelements(meta_tke);
n_bytes -= ggml_nbytes(meta_out);
n_elements -= ggml_nelements(meta_out);
if (n_bytes < GiB) {
LLAMA_LOG_INFO("%s: repeating layers = %.3f MiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
} else {
LLAMA_LOG_INFO("%s: repeating layers = %.3f GiB (%.3f BPW", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
}
// NOTE(review): the unit is chosen from the total ml.n_elements, while the
// value printed is the reduced local n_elements — confirm this is intended.
if (ml.n_elements >= 1e9) {
LLAMA_LOG_INFO(", %.3f B parameters)\n", n_elements*1e-9);
} else {
LLAMA_LOG_INFO(", %.3f M parameters)\n", n_elements*1e-6);
}
}
}
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
// Architecture-specific extras follow.
if (model.is_mla_model()) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llm_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
}
if (model.arch == LLM_ARCH_QWEN2MOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
}
if (model.arch == LLM_ARCH_QWEN3MOE || model.arch == LLM_ARCH_OPENAI_MOE || model.arch == LLM_ARCH_QWEN3VLMOE) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
}
if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
}
if (model.arch == LLM_ARCH_BAILINGMOE2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llm_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
}
vocab.print_info();
}
// Creates an additional copy of the output tensor quantized to `new_type` and publishes it as
// model.output_mtp (owned by model.output_mtp_ptr). model.output itself is left untouched.
//
// The function is a no-op (optionally with a warning) when
//   - new_type is GGML_TYPE_COUNT or the model has no output tensor,
//   - a distinct MTP output tensor is already present,
//   - the output tensor already has the requested type,
//   - the requantized tensor would not be smaller than the current one,
//   - the buffer for the new tensor cannot be allocated.
//
// If the number of rows is not a multiple of the row interleaving of `new_type`, the type returned
// by interleaved_properties() as `other_type` is used instead.
static void llm_requantize_output_tensor(llama_model & model, ggml_type new_type) {
    if (new_type == GGML_TYPE_COUNT || !model.output) return;
    if (model.output_mtp && model.output_mtp != model.output) {
        LLAMA_LOG_WARN("%s: MTP output tensor is already present => not requantizing\n", __func__);
        return;
    }
    if (model.output->type == new_type) {
        LLAMA_LOG_WARN("%s: output tensor is already of type %s => not requantizing\n", __func__, ggml_type_name(new_type));
        // The message promises that nothing is done, so actually stop here.
        return;
    }

    auto [other_type, n_interleaved] = interleaved_properties(new_type);
    if (model.output->ne[1] % n_interleaved != 0) {
        // ne[] is int64_t: use PRId64 so the format is also correct where long is 32 bits wide.
        LLAMA_LOG_WARN("%s: number of rows %" PRId64 " is not a multiple of %d row interleaving for %s\n", __func__,
                model.output->ne[1], n_interleaved, ggml_type_name(new_type));
        LLAMA_LOG_WARN("%s: using %s instead of %s\n", __func__, ggml_type_name(other_type), ggml_type_name(new_type));
        new_type = other_type;
        n_interleaved = 1;
    }

    auto nbytes_orig = ggml_nbytes(model.output);
    auto row_size    = ggml_row_size(new_type, model.output->ne[0]);
    auto nbytes_new  = row_size*ggml_nrows(model.output);
    if (nbytes_new >= nbytes_orig) {
        LLAMA_LOG_WARN("%s: if requantized to %s the output tensor size would be %zu, which is >= the current size %zu => not requantizing\n", __func__, ggml_type_name(new_type), nbytes_new, nbytes_orig);
        return;
    }
    LLAMA_LOG_INFO("====== Creating extra output tensor of type %s for MTP usage. Additional memory required is %.2f MiB\n",
            ggml_type_name(new_type), nbytes_new/1024./1024.);

    // Source data: used in place when the tensor lives in a host buffer, otherwise downloaded first.
    bool is_host = ggml_backend_buffer_is_host(model.output->buffer);
    auto tensor_data = model.output->data;
    std::vector<char> tensor_data_buf;
    if (!is_host) {
        tensor_data_buf.resize(nbytes_orig);
        ggml_backend_tensor_get(model.output, tensor_data_buf.data(), 0, nbytes_orig);
        tensor_data = tensor_data_buf.data();
    }

    // Describe the new tensor: same shape as the output tensor, strides recomputed for new_type.
    auto tt_new = ggml_internal_get_type_traits(new_type);
    auto new_output = std::make_unique<ggml_tensor>(*model.output);
    new_output->type  = new_type;
    new_output->nb[0] = tt_new.type_size;
    new_output->nb[1] = row_size;
    new_output->nb[2] = new_output->nb[1] * new_output->ne[1];
    new_output->nb[3] = new_output->nb[2] * new_output->ne[2];
    GGML_ASSERT(ggml_nbytes(new_output.get()) == nbytes_new);

    // Allocate the storage in the same buffer type as the original output tensor.
    auto new_buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(model.output->buffer), nbytes_new);
    if (!new_buffer) {
        LLAMA_LOG_WARN("%s: failed to allocate %zu bytes for the requantized output tensor => not requantizing\n", __func__, nbytes_new);
        return;
    }
    new_output->buffer = new_buffer;
    new_output->data   = ggml_backend_buffer_get_base(new_output->buffer);
    new_output->op     = GGML_OP_NONE;
    for (int j = 0; j < GGML_MAX_SRC; ++j) new_output->src[j] = nullptr;
    ggml_set_name(new_output.get(), "output_extra.weight");
    ggml_backend_buffer_set_usage(new_output->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    // Destination: written directly for host buffers, otherwise staged in host memory and uploaded below.
    std::vector<char> new_data_buf;
    char * new_data = (char *)new_output->data;
    if (!is_host) {
        new_data_buf.resize(nbytes_new);
        new_data = new_data_buf.data();
    }

    // Quantize using half of the reported hardware threads (at least one); the calling thread takes the last slot.
    int nthread = std::max<int>(1, std::thread::hardware_concurrency()/2);
    // n_interleaved is a structured binding, which C++17 does not allow to be captured directly:
    // copy it with an init-capture instead.
    auto compute = [t = model.output, tensor_data, new_data, nthread, new_type, n_rows_interleaved = n_interleaved] (int ith) {
        std::vector<float> work(t->ne[0]*n_rows_interleaved);
        auto traits_src = ggml_internal_get_type_traits(t->type);
        auto traits_dst = ggml_internal_get_type_traits(new_type);
        iqk_quantize_any(int(t->type), int(new_type),
                t->ne[0], t->ne[1], t->ne[2], t->ne[3],
                t->nb[0], t->nb[1], t->nb[2], t->nb[3],
                tensor_data, new_data, work.data(), traits_src.to_float, traits_dst.from_float, ith, nthread);
    };
    std::vector<std::thread> workers(nthread-1);
    for (int it = 0; it < nthread-1; ++it) workers[it] = std::thread(compute, it);
    compute(nthread-1);
    for (auto & w : workers) w.join();

    if (!is_host) {
        ggml_backend_tensor_set(new_output.get(), new_data, 0, nbytes_new);
    }

    model.output_mtp_ptr = std::move(new_output);
    model.output_mtp     = model.output_mtp_ptr.get();
}
// Prepares the derived MLA attention tensors of every layer. Does nothing for non-MLA models.
//
// The work is done on the CPU with small ggml graphs and proceeds in three phases:
//   1. mla > 0 and at least one layer without wk_b: derive wk_b (the K part of wkv_b, transposed)
//      and wv_b (the V part of wkv_b) from wkv_b. In GRAPH/ATTN split mode an untransposed copy
//      of the K part (wk_b_pp) is produced as well.
//   2. GRAPH/ATTN split mode and mla > 1: for layers that have a split wk_b but no wk_b_pp,
//      gather the per-device wk_b slices, transpose them back and store the result as wk_b_pp.
//   3. Unless mla == 1 (or phase 2 handled every layer in GRAPH mode): for layers that have
//      wk_b and wv_b but no wkv_b (and whose wo is not split), rebuild wkv_b by concatenating
//      the transposed wk_b with wv_b.
//
// Every tensor created here is owned by the layer (computed_* members), is registered in
// model.tensors_by_name, and gets its own backend buffer.
//
// Throws std::runtime_error when the layers do not all have the same number of heads, when a
// graph computation fails, or when a per-device buffer cannot be allocated.
static void llm_prepare_mla(llama_model & model, int mla) {
if (!model.is_mla_model()) return;
const auto& hparams = model.hparams;
const int n_layer = model.layers.size();
// ---- Phase 1: derive wk_b / wv_b (and wk_b_pp in GRAPH/ATTN split mode) from wkv_b ----
int n_to_compute = 0;
for (auto& l : model.layers) {
if (!l.wk_b) ++n_to_compute;
}
if (mla > 0 && n_to_compute > 0) {
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k(0) - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
const int32_t n_embd_head_v = hparams.n_embd_head_v(0);
const int32_t n_head = hparams.n_head(0);
std::vector<uint8_t> work_data;
LLAMA_LOG_INFO("============ %s: need to compute %d wk_b/wv_b tensors\n", __func__, n_to_compute);
// The sizes below are derived from layer 0, so every layer must have the same head count.
for (int il = 1; il < n_layer; ++il) {
if ((int)hparams.n_head(il) != n_head) throw std::runtime_error("Unsupported configuration");
}
// Determine the largest wk_b to be produced and the largest wkv_b that has to be
// downloaded from a non-host buffer.
size_t max_wkv_size = 0;
size_t max_wk_size = 0;
for (auto& l : model.layers) {
if (!l.wk_b) {
auto new_type = ggml_is_quantized(l.wkv_b->type) ? GGML_TYPE_Q8_0 : l.wkv_b->type;
auto size = ggml_row_size(new_type, n_embd_head_qk_nope)*kv_lora_rank*n_head;
max_wk_size = std::max(max_wk_size, size);
if (!ggml_backend_buffer_is_host(l.wkv_b->buffer)) {
max_wkv_size = std::max(max_wkv_size, ggml_nbytes(l.wkv_b));
}
}
}
// The context is created with no_alloc = true, so it only holds tensor/graph metadata;
// the size is doubled, presumably as a safety margin.
auto context_size = 2*max_wk_size + 2*n_embd_head_qk_nope*kv_lora_rank*n_head*sizeof(float);
context_size *= 2; std::vector<uint8_t> wkv_buffer;
if (max_wkv_size > 0) wkv_buffer.resize(max_wkv_size);
ggml_init_params params{context_size, nullptr, true};
auto ctx = ggml_init(params);
auto graph = ggml_new_graph_custom(ctx, 8, false);
// Scratch memory backing the graph tensors, laid out back to back as
// [wk_b as f32][transposed wk_b as f32][wk_b in its final type][wk_b_pp in its final type].
std::vector<uint8_t> tensor_data(2*n_embd_head_qk_nope*kv_lora_rank*n_head*sizeof(float) + 2*max_wk_size);
for (int il = 0; il < n_layer; ++il) {
auto& l = model.layers[il];
if (l.wk_b) continue;
// Work on a shallow copy of the tensor header so that its data pointer can be redirected
// to host memory when the tensor lives in a non-host buffer.
auto wkv_b = *l.wkv_b;
if (!ggml_backend_buffer_is_host(l.wkv_b->buffer)) {
ggml_backend_tensor_get(l.wkv_b, wkv_buffer.data(), 0, ggml_nbytes(l.wkv_b));
wkv_b.data = wkv_buffer.data();
}
// K part: for every head the first n_embd_head_qk_nope rows of wkv_b.
auto wk_b_view = ggml_view_3d(ctx, &wkv_b, kv_lora_rank, n_embd_head_qk_nope, n_head,
l.wkv_b->nb[1], l.wkv_b->nb[1]*(n_embd_head_qk_nope + n_embd_head_v), 0);
auto wk_b_f32 = ggml_cast(ctx, wk_b_view, GGML_TYPE_F32);
wk_b_f32->data = tensor_data.data();
auto wk_b_f32_tview = ggml_transpose(ctx, wk_b_f32);
auto wk_b_f32_t = ggml_cont(ctx, wk_b_f32_tview);
wk_b_f32_t->data = (char *)wk_b_f32->data + ggml_nbytes(wk_b_f32);
// Quantized sources are stored as Q8_0 (Q8_0_R8 when the source type lies in the
// Q4_0_R8..Q8_K_R8 range); non-quantized sources keep their type.
auto new_type = ggml_is_quantized(wkv_b.type) ?
wkv_b.type >= GGML_TYPE_Q4_0_R8 && wkv_b.type <= GGML_TYPE_Q8_K_R8 ? GGML_TYPE_Q8_0_R8 : GGML_TYPE_Q8_0 : wkv_b.type;
auto wk_b = ggml_cast(ctx, wk_b_f32_t, new_type);
wk_b->data = (char *)wk_b_f32_t->data + ggml_nbytes(wk_b_f32_t);
ggml_build_forward_expand(graph, wk_b);
auto plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
plan.work_data = work_data.data();
auto status = ggml_graph_compute(graph, &plan);
if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute wk_b");
auto name = std::string{"blk."} + std::to_string(il) + ".attn_k_b.weight";
// Distribute the result over the devices only when the layer's wo tensor is itself split.
const bool tp_replicate =
(model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN)
&& l.wo && l.wo->extra;
// Copies a tensor computed in scratch memory into permanent storage.
//   source   - tensor whose data currently lives in host scratch memory
//   computed - receives ownership of the resulting (parent) tensor
//   replicas - receives ownership of the per-device slices (tp_replicate only)
//   split    - receives the split descriptor attached to the parent (tp_replicate only)
//   tname    - name given to the tensor and used for registration in model.tensors_by_name
// Returns the raw pointer held by `computed`.
auto materialize = [&](ggml_tensor * source,
std::unique_ptr<ggml_tensor> & computed,
std::vector<std::unique_ptr<ggml_tensor>> & replicas,
llama_split_tensor & split,
const std::string & tname) -> ggml_tensor * {
if (tp_replicate) {
auto wo_split = (const ggml_split_tensor_t *)l.wo->extra;
const int n_device = wo_split->n_device;
const int64_t n_embd_head_v_full = hparams.n_embd_head_v_full;
// Number of heads per device is taken from the wo split: ne[0] / n_embd_head_v_full.
// head_offsets[] is the running prefix sum of these counts.
std::vector<int> head_offsets(n_device + 1, 0);
for (int idx = 0; idx < n_device; ++idx) {
int n_h_id = 0;
if (wo_split->splits[idx]) {
n_h_id = (int)(wo_split->splits[idx]->ne[0] / n_embd_head_v_full);
}
head_offsets[idx + 1] = head_offsets[idx] + n_h_id;
}
// The parent tensor carries no data of its own; it only points to the split descriptor.
computed = std::make_unique<ggml_tensor>(*source);
computed->buffer = nullptr;
computed->data = nullptr;
computed->op = GGML_OP_NONE;
for (int j = 0; j < GGML_MAX_SRC; ++j) computed->src[j] = nullptr;
ggml_set_name(computed.get(), tname.c_str());
replicas.resize(n_device);
split.tensor_splits.assign(n_device, nullptr);
const size_t head_block_bytes = source->nb[2];
for (int id = 0; id < n_device; ++id) {
if (!wo_split->splits[id] || !wo_split->splits[id]->buffer) continue;
const int head_offset = head_offsets[id];
const int n_head_local = head_offsets[id + 1] - head_offset;
if (n_head_local <= 0) continue;
// Each device gets a dedicated buffer of the same buffer type as its wo slice.
const size_t slice_bytes = (size_t)n_head_local * head_block_bytes;
auto dev_buft = ggml_backend_buffer_get_type(wo_split->splits[id]->buffer);
auto dev_buf = ggml_backend_buft_alloc_buffer(dev_buft, slice_bytes);
if (!dev_buf) {
throw std::runtime_error("Failed to allocate per-rank buffer for " + tname);
}
ggml_backend_buffer_set_usage(dev_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
model.bufs.push_back(dev_buf);
replicas[id] = std::make_unique<ggml_tensor>(*source);
auto rep = replicas[id].get();
rep->ne[2] = n_head_local;
rep->nb[3] = rep->nb[2] * (size_t)rep->ne[2];
rep->buffer = dev_buf;
rep->data = ggml_backend_buffer_get_base(dev_buf);
rep->op = GGML_OP_NONE;
for (int j = 0; j < GGML_MAX_SRC; ++j) rep->src[j] = nullptr;
rep->view_src = nullptr;
rep->view_offs = 0;
rep->extra = nullptr;
ggml_set_name(rep, (tname + "." + std::to_string(id)).c_str());
// Upload the contiguous block of heads that belongs to this device.
const uint8_t * src_bytes = (const uint8_t *)source->data + (size_t)head_offset * head_block_bytes;
ggml_backend_tensor_set(rep, src_bytes, 0, slice_bytes);
if (ggml_backend_buffer_is_host(rep->buffer)) {
iqk_modify_tensor(rep);
}
split.tensor_splits[id] = rep;
}
split.ggml.n_device = n_device;
split.ggml.split_dim = 2;
split.ggml.splits = split.tensor_splits.data();
computed->extra = (void *)&split.ggml;
LLAMA_LOG_INFO("Computed %s as %d x %d x %d of type %s, split across %d devices on dim=2\n",
tname.c_str(), (int)source->ne[0], (int)source->ne[1], (int)source->ne[2],
ggml_type_name(source->type), n_device);
} else {
// Single copy, stored in a new buffer of the same buffer type as the layer's wkv_b.
computed = std::make_unique<ggml_tensor>(*source);
computed->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(l.wkv_b->buffer), ggml_nbytes(source));
computed->data = ggml_backend_buffer_get_base(computed->buffer);
computed->op = GGML_OP_NONE;
for (int j = 0; j < GGML_MAX_SRC; ++j) computed->src[j] = nullptr;
ggml_set_name(computed.get(), tname.c_str());
ggml_backend_buffer_set_usage(computed->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
ggml_backend_tensor_set(computed.get(), source->data, 0, ggml_nbytes(source));
if (ggml_backend_buffer_is_host(computed->buffer)) {
iqk_modify_tensor(computed.get());
}
LLAMA_LOG_INFO("Computed %s as %d x %d x %d of type %s and stored in buffer %s\n",
tname.c_str(), (int)source->ne[0], (int)source->ne[1], (int)source->ne[2],
ggml_type_name(source->type), ggml_backend_buffer_name(computed->buffer));
}
model.tensors_by_name.push_back(std::make_pair(tname, computed.get()));
return computed.get();
};
// In GRAPH/ATTN split mode also keep the untransposed K part (wk_b_pp). It is placed in
// scratch memory right behind wk_b, which must still be intact when wk_b is materialized below.
if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
ggml_graph_clear(graph);
auto wk_b_pp = ggml_cast(ctx, wk_b_f32, new_type);
wk_b_pp->data = (char *)wk_b->data + ggml_nbytes(wk_b);
GGML_ASSERT((char *)wk_b_pp->data + ggml_nbytes(wk_b_pp) <=
(char *)tensor_data.data() + tensor_data.size());
ggml_build_forward_expand(graph, wk_b_pp);
auto plan_pp = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
if (plan_pp.work_size > work_data.size()) work_data.resize(plan_pp.work_size);
plan_pp.work_data = work_data.data();
auto status_pp = ggml_graph_compute(graph, &plan_pp);
if (status_pp != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute attn_kv_b");
auto name_pp = std::string{"blk."} + std::to_string(il) + ".attn_kv_b.weight";
l.wk_b_pp = materialize(wk_b_pp, l.computed_wk_b_pp, l.computed_wk_b_pp_replicas, l.split_wk_b_pp, name_pp);
ggml_graph_clear(graph);
}
l.wk_b = materialize(wk_b, l.computed_wk_b, l.computed_wk_b_replicas, l.split_wk_b, name);
ggml_graph_clear(graph);
// V part: for every head the n_embd_head_v rows that follow the K rows. The scratch memory
// is reused from its start because the K results have already been copied out.
auto wv_b = ggml_cont(ctx, ggml_view_3d(ctx, &wkv_b, kv_lora_rank, n_embd_head_v, n_head,
l.wkv_b->nb[1], l.wkv_b->nb[1]*(n_embd_head_qk_nope + n_embd_head_v), l.wkv_b->nb[1]*n_embd_head_qk_nope));
wv_b->data = tensor_data.data();
ggml_build_forward_expand(graph, wv_b);
plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
plan.work_data = work_data.data();
status = ggml_graph_compute(graph, &plan);
if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute wv_b");
name = std::string{"blk."} + std::to_string(il) + ".attn_v_b.weight";
l.wv_b = materialize(wv_b, l.computed_wv_b, l.computed_wv_b_replicas, l.split_wv_b, name);
ggml_graph_clear(graph);
}
ggml_free(ctx);
}
// ---- Phase 2: rebuild wk_b_pp from an already split wk_b (GRAPH/ATTN split mode, mla > 1) ----
// n_computed counts the layers handled here; it is used by the early return before phase 3.
int n_computed = 0;
if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && mla > 1) {
int n_pp_to_compute = 0;
for (auto & l : model.layers) {
if (l.wk_b && !l.wk_b_pp) ++n_pp_to_compute;
}
if (n_pp_to_compute > 0) {
const uint32_t n_embd_head_qk_nope_pp = hparams.n_embd_head_k(0) - hparams.n_rot;
const uint32_t kv_lora_rank_pp = hparams.n_lora_kv;
const int32_t n_head_pp = hparams.n_head(0);
size_t max_wk_size_pp = 0;
for (auto & l : model.layers) {
if (l.wk_b && !l.wk_b_pp) {
max_wk_size_pp = std::max(max_wk_size_pp, ggml_nbytes(l.wk_b));
}
}
auto context_size_pp = 4 * max_wk_size_pp +
4 * (size_t)n_embd_head_qk_nope_pp * kv_lora_rank_pp * n_head_pp * sizeof(float);
context_size_pp *= 2;
std::vector<uint8_t> work_data_pp;
// Host staging area into which the per-device wk_b slices are gathered.
std::vector<uint8_t> full_wk_b_host_buf(max_wk_size_pp);
// Scratch memory for the graph: [wk_b as f32][transposed f32][result in wk_b's type].
std::vector<uint8_t> tensor_data_pp(
2 * (size_t)n_embd_head_qk_nope_pp * kv_lora_rank_pp * n_head_pp * sizeof(float) +
max_wk_size_pp);
ggml_init_params params_pp{context_size_pp, nullptr, true};
auto ctx_pp = ggml_init(params_pp);
auto graph_pp = ggml_new_graph_custom(ctx_pp, 8, false);
LLAMA_LOG_INFO("============ %s: need to compute %d attn_kv_b tensors\n", __func__, n_pp_to_compute);
for (int il = 0; il < n_layer; ++il) {
auto & l = model.layers[il];
if (!l.wk_b || l.wk_b_pp) continue;
// Only layers whose wo and wk_b are both split across devices are handled here.
if (!l.wo || !l.wo->extra || !l.wk_b->extra) continue;
++n_computed;
auto wk_b_split = (const ggml_split_tensor_t *)l.wk_b->extra;
auto wo_split = (const ggml_split_tensor_t *)l.wo->extra;
const int n_device = wo_split->n_device;
auto name = std::string{"blk."} + std::to_string(il) + ".attn_kv_b.weight";
// Header describing the full wk_b_pp: kv_lora_rank x n_embd_head_qk_nope x n_head in
// wk_b's type, with contiguous strides.
ggml_tensor wk_b_pp_template = *l.wk_b;
wk_b_pp_template.ne[0] = (int64_t)kv_lora_rank_pp;
wk_b_pp_template.ne[1] = (int64_t)n_embd_head_qk_nope_pp;
wk_b_pp_template.ne[2] = (int64_t)n_head_pp;
wk_b_pp_template.nb[0] = ggml_type_size(l.wk_b->type);
wk_b_pp_template.nb[1] = wk_b_pp_template.nb[0] * (wk_b_pp_template.ne[0] / ggml_blck_size(l.wk_b->type));
wk_b_pp_template.nb[2] = wk_b_pp_template.nb[1] * wk_b_pp_template.ne[1];
wk_b_pp_template.nb[3] = wk_b_pp_template.nb[2] * wk_b_pp_template.ne[2];
// The parent tensor carries no data of its own; it only points to the split descriptor.
l.computed_wk_b_pp = std::make_unique<ggml_tensor>(wk_b_pp_template);
l.computed_wk_b_pp->buffer = nullptr;
l.computed_wk_b_pp->data = nullptr;
l.computed_wk_b_pp->op = GGML_OP_NONE;
for (int j = 0; j < GGML_MAX_SRC; ++j) l.computed_wk_b_pp->src[j] = nullptr;
ggml_set_name(l.computed_wk_b_pp.get(), name.c_str());
l.computed_wk_b_pp_replicas.resize(n_device);
l.split_wk_b_pp.tensor_splits.assign(n_device, nullptr);
const size_t per_head_pp_bytes = wk_b_pp_template.nb[2];
// Heads per device are taken from the existing wk_b slices (ne[2]); head_offsets[] is
// the running prefix sum.
std::vector<int> head_offsets(n_device + 1, 0);
for (int id = 0; id < n_device; ++id) {
int n_h_id = 0;
if (wk_b_split->splits[id]) {
n_h_id = (int)wk_b_split->splits[id]->ne[2];
}
head_offsets[id + 1] = head_offsets[id] + n_h_id;
}
const size_t per_head_in_bytes = l.wk_b->nb[2];
const size_t full_wk_b_nbytes = (size_t)head_offsets[n_device] * per_head_in_bytes;
GGML_ASSERT(full_wk_b_nbytes <= full_wk_b_host_buf.size());
// Gather the slices of all devices into one contiguous host copy of wk_b.
for (int id = 0; id < n_device; ++id) {
if (!wk_b_split->splits[id]) continue;
auto wk_b_rank = wk_b_split->splits[id];
const size_t rank_nbytes = ggml_nbytes(wk_b_rank);
const size_t byte_offset = (size_t)head_offsets[id] * per_head_in_bytes;
GGML_ASSERT(byte_offset + rank_nbytes <= full_wk_b_nbytes);
ggml_backend_tensor_get(wk_b_rank, full_wk_b_host_buf.data() + byte_offset, 0, rank_nbytes);
}
// Cast to f32, transpose, make contiguous and cast back to wk_b's type.
ggml_tensor full_host = *l.wk_b;
full_host.data = full_wk_b_host_buf.data();
auto f_f32 = ggml_cast(ctx_pp, &full_host, GGML_TYPE_F32);
f_f32->data = tensor_data_pp.data();
auto f_view = ggml_transpose(ctx_pp, f_f32);
auto f_cont = ggml_cont(ctx_pp, f_view);
f_cont->data = (char *)f_f32->data + ggml_nbytes(f_f32);
auto f_q = ggml_cast(ctx_pp, f_cont, l.wk_b->type);
f_q->data = (char *)f_cont->data + ggml_nbytes(f_cont);
ggml_build_forward_expand(graph_pp, f_q);
auto plan = ggml_graph_plan(graph_pp, std::thread::hardware_concurrency()/2);
if (plan.work_size > work_data_pp.size()) work_data_pp.resize(plan.work_size);
plan.work_data = work_data_pp.data();
auto status = ggml_graph_compute(graph_pp, &plan);
if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute attn_kv_b");
ggml_graph_clear(graph_pp);
// Scatter the result back: every device receives the heads it already holds for wk_b,
// stored in a new buffer of the same buffer type as its wo slice.
for (int id = 0; id < n_device; ++id) {
if (!wo_split->splits[id] || !wo_split->splits[id]->buffer) continue;
if (!wk_b_split->splits[id]) continue;
const int n_head_local = (int)wk_b_split->splits[id]->ne[2];
if (n_head_local <= 0) continue;
const size_t slice_bytes = (size_t)n_head_local * per_head_pp_bytes;
auto dev_buft = ggml_backend_buffer_get_type(wo_split->splits[id]->buffer);
auto dev_buf = ggml_backend_buft_alloc_buffer(dev_buft, slice_bytes);
if (!dev_buf) {
throw std::runtime_error("Failed to allocate per-rank buffer for " + name);
}
ggml_backend_buffer_set_usage(dev_buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
model.bufs.push_back(dev_buf);
l.computed_wk_b_pp_replicas[id] = std::make_unique<ggml_tensor>(wk_b_pp_template);
auto rep = l.computed_wk_b_pp_replicas[id].get();
rep->ne[2] = n_head_local;
rep->nb[3] = rep->nb[2] * (size_t)rep->ne[2];
rep->buffer = dev_buf;
rep->data = ggml_backend_buffer_get_base(dev_buf);
rep->op = GGML_OP_NONE;
for (int j = 0; j < GGML_MAX_SRC; ++j) rep->src[j] = nullptr;
rep->view_src = nullptr;
rep->view_offs = 0;
rep->extra = nullptr;
ggml_set_name(rep, (name + "." + std::to_string(id)).c_str());
const size_t byte_offset = (size_t)head_offsets[id] * per_head_pp_bytes;
ggml_backend_tensor_set(rep, (char *)f_q->data + byte_offset, 0, slice_bytes);
if (ggml_backend_buffer_is_host(rep->buffer)) {
iqk_modify_tensor(rep);
}
l.split_wk_b_pp.tensor_splits[id] = rep;
}
l.split_wk_b_pp.ggml.n_device = n_device;
l.split_wk_b_pp.ggml.split_dim = 2;
l.split_wk_b_pp.ggml.splits = l.split_wk_b_pp.tensor_splits.data();
l.computed_wk_b_pp->extra = (void *)&l.split_wk_b_pp.ggml;
model.tensors_by_name.push_back(std::make_pair(name, l.computed_wk_b_pp.get()));
l.wk_b_pp = l.computed_wk_b_pp.get();
LLAMA_LOG_INFO("Computed %s as %d x %d x %d of type %s, split across %d devices on dim=2\n",
name.c_str(),
(int)l.computed_wk_b_pp->ne[0],
(int)l.computed_wk_b_pp->ne[1],
(int)l.computed_wk_b_pp->ne[2],
ggml_type_name(l.computed_wk_b_pp->type), n_device);
}
ggml_free(ctx_pp);
}
}
// ---- Phase 3: rebuild wkv_b from wk_b and wv_b ----
// Not needed for mla == 1, nor in GRAPH split mode when phase 2 handled every layer.
if (mla == 1 || (model.split_mode == LLAMA_SPLIT_MODE_GRAPH && n_computed == n_layer)) return;
n_to_compute = 0;
for (auto& l : model.layers) {
if (l.wk_b && l.wv_b && !l.wkv_b) ++n_to_compute;
}
if (n_to_compute == 0) return;
const int32_t n_head = hparams.n_head(0);
std::vector<uint8_t> work_data;
LLAMA_LOG_INFO("============ %s: need to compute %d wkv_b tensors\n", __func__, n_to_compute);
for (int il = 1; il < n_layer; ++il) {
if ((int)hparams.n_head(il) != n_head) throw std::runtime_error("Unsupported configuration");
}
// Metadata-only context (no_alloc = true), sized for 16 tensor headers per layer; it is not
// reset between layers.
size_t context_size = ggml_tensor_overhead()*16*n_layer;
ggml_init_params params{context_size, nullptr, true};
auto ctx = ggml_init(params);
auto graph = ggml_new_graph_custom(ctx, 8, false);
// Host staging buffers, grown on demand and reused across layers.
std::vector<char> wk_buffer, wv_buffer;
std::vector<char> tmp_buffer;
for (int il = 0; il < n_layer; ++il) {
auto& l = model.layers[il];
// Skip layers that already have wkv_b, lack one of the inputs, or have a split wo tensor.
if (l.wkv_b || !l.wk_b || !l.wv_b || (l.wo && l.wo->extra)) continue;
// Shallow header copies whose data pointers are redirected to host memory when needed.
auto wk_b = *l.wk_b;
auto wv_b = *l.wv_b;
if (!ggml_backend_buffer_is_host(l.wk_b->buffer)) {
auto nbytes = ggml_nbytes(l.wk_b);
if (wk_buffer.size() < nbytes) wk_buffer.resize(nbytes);
ggml_backend_tensor_get(l.wk_b, wk_buffer.data(), 0, nbytes);
wk_b.data = wk_buffer.data();
}
if (!ggml_backend_buffer_is_host(l.wv_b->buffer)) {
auto nbytes = ggml_nbytes(l.wv_b);
if (wv_buffer.size() < nbytes) wv_buffer.resize(nbytes);
ggml_backend_tensor_get(l.wv_b, wv_buffer.data(), 0, nbytes);
wv_b.data = wv_buffer.data();
}
auto n_wk = ggml_nelements(&wk_b);
auto n_wv = ggml_nelements(&wv_b);
// Scratch budget, in the order the tensors are placed below: optional f32 copy of wk_b,
// transposed wk_b, optional f32 copy of wv_b, the f32 concatenation, and the final wkv_b
// (budgeted as if it were f32).
size_t tot_size = 0;
if (wk_b.type != GGML_TYPE_F32) {
tot_size += n_wk*sizeof(float);
}
tot_size += n_wk*sizeof(float); if (wv_b.type != GGML_TYPE_F32) {
tot_size += n_wv*sizeof(float);
}
tot_size += (n_wk + n_wv)*sizeof(float); tot_size += (n_wk + n_wv)*sizeof(float);
if (tmp_buffer.size() < tot_size) tmp_buffer.resize(tot_size);
// ptr walks through tmp_buffer; every graph tensor gets the next free region.
auto ptr = tmp_buffer.data();
auto wk_b_used = &wk_b;
if (wk_b.type != GGML_TYPE_F32) {
wk_b_used = ggml_cast(ctx, &wk_b, GGML_TYPE_F32);
wk_b_used->data = ptr;
ptr += ggml_nbytes(wk_b_used);
}
auto wk_b_transposed = ggml_cont(ctx, ggml_transpose(ctx, wk_b_used));
wk_b_transposed->data = ptr;
ptr += ggml_nbytes(wk_b_transposed);
auto wv_b_used = &wv_b;
if (wv_b.type != GGML_TYPE_F32) {
wv_b_used = ggml_cast(ctx, &wv_b, GGML_TYPE_F32);
wv_b_used->data = ptr;
ptr += ggml_nbytes(wv_b_used);
}
// Concatenate along dim 1 (K rows followed by V rows within each head), then view the
// result as a 2D matrix by folding the head dimension into the rows.
auto wkv_b_f32_3d = ggml_concat(ctx, wk_b_transposed, wv_b_used, 1);
wkv_b_f32_3d->data = ptr;
ptr += ggml_nbytes(wkv_b_f32_3d);
auto wkv_b_f32 = ggml_view_2d(ctx, wkv_b_f32_3d, wkv_b_f32_3d->ne[0], wkv_b_f32_3d->ne[1]*wkv_b_f32_3d->ne[2],
wkv_b_f32_3d->nb[1], 0);
// BF16 or F16 is kept only when both inputs have that type; everything else becomes Q8_0.
auto new_type = wk_b.type == GGML_TYPE_BF16 && wv_b.type == GGML_TYPE_BF16 ? GGML_TYPE_BF16
: wk_b.type == GGML_TYPE_F16 && wv_b.type == GGML_TYPE_F16 ? GGML_TYPE_F16
: GGML_TYPE_Q8_0;
auto wkv_b = ggml_cast(ctx, wkv_b_f32, new_type);
wkv_b->data = ptr;
ptr += ggml_nbytes(wkv_b);
ggml_build_forward_expand(graph, wkv_b);
auto plan = ggml_graph_plan(graph, std::thread::hardware_concurrency()/2);
if (plan.work_size > work_data.size()) work_data.resize(plan.work_size);
plan.work_data = work_data.data();
auto status = ggml_graph_compute(graph, &plan);
if (status != GGML_STATUS_SUCCESS) throw std::runtime_error("Failed to compute wkv_b");
// Store the result in a new buffer of the same buffer type as the layer's wk_b.
auto name = std::string{"blk."} + std::to_string(il) + ".attn_kv_b.weight";
l.computed_wkv_b = std::make_unique<ggml_tensor>(*wkv_b);
l.computed_wkv_b->buffer = ggml_backend_buft_alloc_buffer(ggml_backend_buffer_get_type(l.wk_b->buffer), ggml_nbytes(wkv_b));
l.computed_wkv_b->data = ggml_backend_buffer_get_base(l.computed_wkv_b->buffer);
l.computed_wkv_b->op = GGML_OP_NONE; for (int j = 0; j < GGML_MAX_SRC; ++j) l.computed_wkv_b->src[j] = nullptr;
ggml_set_name(l.computed_wkv_b.get(), name.c_str());
ggml_backend_buffer_set_usage(l.computed_wkv_b->buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
ggml_backend_tensor_set(l.computed_wkv_b.get(), wkv_b->data, 0, ggml_nbytes(wkv_b));
if (ggml_backend_buffer_is_host(l.computed_wkv_b->buffer)) {
iqk_modify_tensor(l.computed_wkv_b.get());
}
l.wkv_b = l.computed_wkv_b.get();
model.tensors_by_name.push_back(std::make_pair(name, l.wkv_b));
LLAMA_LOG_INFO("Computed %s as %d x %d of type %s and stored in buffer %s\n", name.c_str(), (int)wkv_b->ne[0], (int)wkv_b->ne[1],
ggml_type_name(wkv_b->type), ggml_backend_buffer_name(l.computed_wkv_b->buffer));
ggml_graph_clear(graph);
}
ggml_free(ctx);
}
// Applies ggml_hadamard(..., 64) along the first dimension of the wv_b and wk_b_pp tensors of an
// MLA model and writes the transformed weights back in place, at most once per model.
//
// Nothing is changed when
//   - the model is already marked as pre-transformed or is not an MLA model,
//   - n_lora_kv is not a positive multiple of 64,
//   - no layer has both wv_b and wk_b_pp, or one of these tensors has a type that is not in the
//     list of types handled by the cast round trip below.
// On success model.khad_pretransformed is set. If folding a tensor fails, an error is logged and
// the function returns immediately, without setting the flag.
static void llm_apply_khad_pretransform(llama_model & model) {
    if (model.khad_pretransformed || !model.is_mla_model()) return;

    // Tensor types accepted for the cast-to-F32 / cast-back round trip.
    const auto is_castable = [](ggml_type type) -> bool {
        switch (type) {
            case GGML_TYPE_F32:
            case GGML_TYPE_F16:
            case GGML_TYPE_BF16:
            case GGML_TYPE_Q4_0:
            case GGML_TYPE_Q4_1:
            case GGML_TYPE_Q5_0:
            case GGML_TYPE_Q5_1:
            case GGML_TYPE_Q6_0:
            case GGML_TYPE_Q8_0:
            case GGML_TYPE_IQ4_NL:
            case GGML_TYPE_Q4_K:
            case GGML_TYPE_Q5_K:
            case GGML_TYPE_Q6_K:
            case GGML_TYPE_IQ4_K:
            case GGML_TYPE_IQ5_K:
            case GGML_TYPE_IQ4_KS:
            case GGML_TYPE_IQ4_KSS:
            case GGML_TYPE_IQ5_KS:
                return true;
            default:
                return false;
        }
    };

    const int64_t kv_lora_rank = model.hparams.n_lora_kv;
    if (kv_lora_rank <= 0 || kv_lora_rank % 64 != 0) return;

    // First pass: count the candidate layers and stop at the first unsupported tensor type.
    int  n_candidates    = 0;
    bool has_unsupported = false;
    for (const auto & layer : model.layers) {
        if (!layer.wv_b || !layer.wk_b_pp) continue;
        ++n_candidates;
        if (!is_castable(layer.wv_b->type) || !is_castable(layer.wk_b_pp->type)) {
            has_unsupported = true;
            break;
        }
    }
    if (has_unsupported || n_candidates == 0) {
        LLAMA_LOG_INFO("============ %s: skipping (no eligible wv_b/wk_b_pp; n_pp=%d)\n",
                __func__, n_candidates);
        return;
    }

    // Transforms one tensor in place: download -> F32 -> Hadamard -> original type -> upload.
    // A null tensor counts as success; a tensor whose ne[0] differs from kv_lora_rank is rejected.
    const auto fold_one = [&](ggml_tensor * target) -> bool {
        if (!target) return true;
        if (target->ne[0] != kv_lora_rank) return false;

        const size_t n_bytes = ggml_nbytes(target);
        std::vector<uint8_t> src_bytes(n_bytes);
        ggml_backend_tensor_get(target, src_bytes.data(), 0, n_bytes);

        // Metadata-only context; all tensor data lives in the vectors below.
        ggml_init_params init_params{ ggml_tensor_overhead()*32 + ggml_graph_overhead(), nullptr, true };
        auto fold_ctx   = ggml_init(init_params);
        auto fold_graph = ggml_new_graph(fold_ctx);

        // Detached host-side copy of the tensor header pointing at the downloaded bytes.
        ggml_tensor host_src = *target;
        host_src.data = src_bytes.data();
        host_src.op   = GGML_OP_NONE;
        for (int j = 0; j < GGML_MAX_SRC; ++j) host_src.src[j] = nullptr;
        host_src.buffer    = nullptr;
        host_src.extra     = nullptr;
        host_src.view_src  = nullptr;
        host_src.view_offs = 0;

        const size_t n_bytes_f32 = (size_t)ggml_nelements(target) * sizeof(float);
        std::vector<uint8_t> as_f32(n_bytes_f32);
        std::vector<uint8_t> rotated_f32(n_bytes_f32);
        std::vector<uint8_t> result_bytes(n_bytes);

        auto node_f32 = ggml_cast(fold_ctx, &host_src, GGML_TYPE_F32);
        node_f32->data = as_f32.data();
        auto node_had = ggml_hadamard(fold_ctx, node_f32, 64);
        node_had->data = rotated_f32.data();
        auto node_out = ggml_cast(fold_ctx, node_had, target->type);
        node_out->data = result_bytes.data();
        ggml_build_forward_expand(fold_graph, node_out);

        auto fold_plan = ggml_graph_plan(fold_graph, std::thread::hardware_concurrency()/2);
        std::vector<uint8_t> plan_work(fold_plan.work_size);
        fold_plan.work_data = plan_work.data();
        const bool computed = ggml_graph_compute(fold_graph, &fold_plan) == GGML_STATUS_SUCCESS;
        ggml_free(fold_ctx);

        // The tensor is only overwritten when the graph ran successfully.
        if (computed) {
            ggml_backend_tensor_set(target, result_bytes.data(), 0, n_bytes);
        }
        return computed;
    };

    // Folds either the tensor itself or, for a split tensor, every non-null per-device part.
    const auto fold_all_parts = [&](ggml_tensor * whole) -> bool {
        if (!whole) return true;
        if (!whole->extra) return fold_one(whole);
        auto parts = (const ggml_split_tensor_t *)whole->extra;
        for (int id = 0; id < parts->n_device; ++id) {
            auto part = parts->splits[id];
            if (part && !fold_one(part)) return false;
        }
        return true;
    };

    // Second pass: transform wv_b first, then wk_b_pp, for every candidate layer.
    int n_done = 0;
    for (auto & layer : model.layers) {
        if (!layer.wv_b || !layer.wk_b_pp) continue;
        if (!fold_all_parts(layer.wv_b)) {
            LLAMA_LOG_ERROR("%s: failed to fold wv_b\n", __func__);
            return;
        }
        if (!fold_all_parts(layer.wk_b_pp)) {
            LLAMA_LOG_ERROR("%s: failed to fold wk_b_pp\n",__func__);
            return;
        }
        ++n_done;
    }
    model.khad_pretransformed = (n_done > 0);
    LLAMA_LOG_INFO("============ %s: folded H into wv_b/wk_b_pp on %d layers\n", __func__, n_done);
}
// Multiplies the first n_embd values of every layer's ffn_gate_inp_s tensor by 1/sqrt(n_embd).
//
//   - Host tensors without mmap are scaled in place.
//   - Host tensors with mmap (uses_mmap == true) are not modified in place: scaled copies are
//     written to model.aux_buffer and the tensors' data pointers are redirected to them.
//   - Non-host tensors are downloaded, scaled and uploaded again. For a split tensor the values are
//     read from the first available per-device split and written to every available split.
static void llm_scale_gate_inp_s(llama_model & model, bool uses_mmap) {
    auto & hparams = model.hparams;
    LLAMA_LOG_INFO("%s: n_embd = %d\n", __func__, hparams.n_embd);

    std::vector<float> values(hparams.n_embd);
    const size_t n_values = values.size();
    const size_t n_bytes  = n_values*sizeof(float);
    const float  scale    = 1.0f/sqrtf((float)hparams.n_embd);

    int n_host = 0;
    for (auto & l : model.layers) {
        auto gis = l.ffn_gate_inp_s;
        if (!gis) continue;

        if (ggml_backend_buffer_is_host(gis->buffer)) {
            if (uses_mmap) {
                // Handled after this loop, once the number of such tensors is known.
                ++n_host;
            } else {
                auto val = (float *)gis->data;
                for (size_t j = 0; j < n_values; ++j) val[j] *= scale;
            }
            continue;
        }

        auto extra = (ggml_split_tensor_t *)gis->extra;
        if (!extra) {
            ggml_backend_tensor_get(gis, values.data(), 0, n_bytes);
            for (auto & v : values) v *= scale;
            ggml_backend_tensor_set(gis, values.data(), 0, n_bytes);
            continue;
        }

        // A split tensor may have empty slots for some devices: read from the first slot that is
        // populated (slot 0 whenever it exists) and skip the empty ones when writing back.
        ggml_tensor * first_split = nullptr;
        for (int id = 0; id < extra->n_device && !first_split; ++id) {
            first_split = extra->splits[id];
        }
        if (!first_split) continue;
        ggml_backend_tensor_get(first_split, values.data(), 0, n_bytes);
        for (auto & v : values) v *= scale;
        for (int id = 0; id < extra->n_device; ++id) {
            if (!extra->splits[id]) continue;
            ggml_backend_tensor_set(extra->splits[id], values.data(), 0, n_bytes);
        }
    }

    if (n_host > 0) {
        // NOTE(review): the resize count includes sizeof(float) while the pointer below advances by
        // n_embd elements per tensor - confirm against the element type of model.aux_buffer.
        model.aux_buffer.resize(n_host * hparams.n_embd * sizeof(float));
        auto ptr = model.aux_buffer.data();
        for (auto & l : model.layers) {
            auto gis = l.ffn_gate_inp_s;
            if (!gis) continue;
            if (ggml_backend_buffer_is_host(gis->buffer)) {
                auto val = (const float *)gis->data;
                for (size_t j = 0; j < n_values; ++j) ptr[j] = val[j] * scale;
                gis->data = ptr;
                ptr += hparams.n_embd;
            }
        }
    }
}
// Case-insensitive comparison of two NUL-terminated strings.
// Returns true when both strings have the same length and all characters compare equal after
// std::tolower(); neither pointer may be null.
static bool striequals(const char* a, const char* b) {
    for (; *a && *b; a++, b++) {
        // std::tolower() is only defined for values representable as unsigned char (or EOF),
        // so plain char must be converted first to be safe for bytes >= 0x80.
        const int lower_a = std::tolower(static_cast<unsigned char>(*a));
        const int lower_b = std::tolower(static_cast<unsigned char>(*b));
        if (lower_a != lower_b) {
            return false;
        }
    }
    // Equal only if both strings ended at the same position.
    return *a == *b;
}
// Looks up a backend of this context by name, ignoring case.
// Returns the first backend whose ggml_backend_name() matches `name`, or nullptr if there is none.
ggml_backend_t llama_context::ggml_backend_by_name(const char* name) {
    const auto match = std::find_if(backends.begin(), backends.end(),
            [name](ggml_backend_t candidate) {
                return striequals(ggml_backend_name(candidate), name);
            });
    if (match == backends.end()) {
        return nullptr;
    }
    return *match;
}
// Returns true when `name` equals one of the entries of `devices`, ignoring case.
static bool item_in_list(const std::vector<std::string>& devices, const char* name) {
    return std::any_of(devices.begin(), devices.end(),
            [name](const std::string & entry) {
                return striequals(entry.c_str(), name);
            });
}
// Appends `backend` to ctx->backends.
// When ctx->cparams.devices is non-empty it acts as an allow-list: the backend is only added if
// its name appears in that list (case-insensitive); an empty list accepts every backend.
static void ggml_backend_add_from_device(llama_context* ctx, ggml_backend_t backend) {
    const char* backend_name = ggml_backend_name(backend);
    const auto & allowed = ctx->cparams.devices;
    const bool accepted = allowed.empty() || item_in_list(allowed, backend_name);
    if (accepted) {
        ctx->backends.push_back(backend);
    }
}
// Returns true when model.arch is one of the architectures listed below as supported for model
// splitting.
static bool is_model_split_supported(const llama_model & model) {
    // Immutable lookup table; every architecture is listed exactly once.
    static const std::unordered_set<llm_arch> k_supported = {
        LLM_ARCH_LLAMA,
        LLM_ARCH_QWEN3MOE,
        LLM_ARCH_GLM4_MOE,
        LLM_ARCH_MISTRAL3,
        LLM_ARCH_COMMAND_R,
        LLM_ARCH_COHERE2,
        LLM_ARCH_COHERE2_MOE,
        LLM_ARCH_MIMO2,
        LLM_ARCH_QWEN3,
        LLM_ARCH_QWEN3VL,
        LLM_ARCH_HUNYUAN_MOE,
        LLM_ARCH_OPENAI_MOE,
        LLM_ARCH_ERNIE4_5_MOE,
        LLM_ARCH_MINIMAX_M2,
        LLM_ARCH_SEED_OSS,
        LLM_ARCH_STEP35,
        LLM_ARCH_LAGUNA,
        LLM_ARCH_QWEN35,
        LLM_ARCH_QWEN35MOE,
        LLM_ARCH_GEMMA4,
        LLM_ARCH_DEEPSEEK2,
        LLM_ARCH_GLM_DSA,
        LLM_ARCH_MISTRAL4,
        LLM_ARCH_MELLUM,
    };
    return k_supported.find(model.arch) != k_supported.end();
}
// Per-layer pointers to the expert FFN weight tensors; all of them are null until assigned.
// get_layer_sizes() fills them from the tensor kind: `up` for FFN_UP_EXPS or FFN_GATE_UP_EXPS,
// `gate` for FFN_GATE_EXPS and `down` for FFN_DOWN_EXPS.
struct expert_tensors {
    ggml_tensor * up{nullptr};
    ggml_tensor * gate{nullptr};
    ggml_tensor * down{nullptr};
};
static std::pair<std::vector<double>, double> get_layer_sizes(const llama_model_loader & ml, const llama_model & model,
ggml_type cache_type_k, ggml_type cache_type_v, uint32_t max_ctx_size, int mla_attn, int n_seq_max, int n_ubatch,
int amb, int worst_case_tokens, bool flash_attn,
std::vector<expert_tensors> & experts) {
int n_layer = model.hparams.n_layer;
std::vector<double> result(n_layer+1, 0);
std::vector<double> compute(n_layer+1, 0);
struct mla_tensors {
ggml_tensor * wk_b = nullptr;
ggml_tensor * wv_b = nullptr;
ggml_tensor * wkv_b = nullptr;
};
std::vector<mla_tensors> mla_tensors;
bool has_mla = model.is_mla_model();
if (has_mla) {
mla_tensors.resize(n_layer);
}
experts.resize(n_layer);
std::vector<size_t> ffn_exps(n_layer, 0), ffn_shexp(n_layer, 0), ffn_dense(n_layer, 0);
std::vector<size_t> attn(n_layer, 0);
std::vector<bool> has_layer_norm(n_layer, false);
size_t ow_size = 0;
size_t embd_size = 0;
size_t output_misc_size = 0;
int gemma4_rope_layer = -1;
if (model.arch == LLM_ARCH_GEMMA4) {
for (int il = 0; il < n_layer; ++il) {
if (!model.hparams.swa_layers[il]) {
gemma4_rope_layer = il;
break;
}
}
}
for (int i = 0; i < ml.n_tensors; ++i) {
auto t = ml.get_weight(i)->tensor;
std::string name(t->name);
auto size = ggml_nbytes(t);
if (name == "token_embd.weight") {
embd_size = size;
continue;
}
if (name == "output.weight") {
ow_size = size;
continue;
}
if (name == "output_norm.weight") {
continue;
}
if (model.arch == LLM_ARCH_GEMMA4) {
if (name == "per_layer_token_embd.weight" ||
name == "per_layer_model_proj.weight" ||
name == "per_layer_proj_norm.weight") {
output_misc_size += size;
continue;
}
if (name == "rope_freqs.weight") {
if (gemma4_rope_layer >= 0) {
result[gemma4_rope_layer] += size;
}
continue;
}
}
if (name == "mtp_pre_proj.weight" || name == "mtp_post_proj.weight" ||
name == "mtp_centroids.weight" || name == "mtp_token_ordering.weight") {
continue;
}
auto pos = name.find("blk.");
if (pos != 0) {
LLAMA_LOG_WARN("Oops: tensor with strange name %s\n", name.c_str());
continue;
}
pos += 4;
auto pos1 = name.find('.', pos);
if (pos1 == std::string::npos) {
LLAMA_LOG_WARN("Oops: tensor with strange name %s\n", name.c_str());
continue;
}
auto layer_string = name.substr(pos, pos1-pos);
std::istringstream str(layer_string);
int il; str >> il;
if (str.fail()) {
LLAMA_LOG_WARN("Oops: failed to read layer index from %s for tensor %s\n", layer_string.c_str(), name.c_str());
}
if (il < 0 || il >= model.hparams.n_layer) {
LLAMA_LOG_WARN("Oops: strange layer index %d for tensor %s\n", il, name.c_str());
continue;
}
if (!model.mtp && model.hparams.nextn_predict_layers > 0 && il >= n_layer - model.hparams.nextn_predict_layers) {
continue;
}
result[il] += size;
if (auto pos = name.rfind(".bias"); pos < name.size() && name.size() - pos == 5) {
continue;
}
if (auto pos = name.rfind(".scale"); pos < name.size() && name.size() - pos == 6) {
continue;
}
bool is_mla = false;
if (has_mla) {
if (name.find("attn_k_b.weight") != std::string::npos) {
mla_tensors[il].wk_b = t;
is_mla = true;
}
else if (name.find("attn_v_b.weight") != std::string::npos) {
mla_tensors[il].wv_b = t;
is_mla = true;
}
else if (name.find("attn_kv_b.weight") != std::string::npos) {
mla_tensors[il].wkv_b = t;
is_mla = true;
}
}
if (!is_mla) {
auto ttype = llm_tensor_type(model.arch, name, il);
if (ttype == LLM_TENSOR_UNKNOWN) {
LLAMA_LOG_WARN("Oops: got unknown for tensor %s\n", name.c_str());
continue;
}
if (ttype == LLM_TENSOR_FFN_GATE_UP_EXPS || ttype == LLM_TENSOR_FFN_GATE_EXPS || ttype == LLM_TENSOR_FFN_UP_EXPS || ttype == LLM_TENSOR_FFN_DOWN_EXPS) {
auto size = t->ne[1] * n_ubatch * model.hparams.n_expert_used * sizeof(float);
ffn_exps[il] += size;
if (name.find(".bias") == std::string::npos) {
if (ttype == LLM_TENSOR_FFN_GATE_UP_EXPS || ttype == LLM_TENSOR_FFN_UP_EXPS) {
experts[il].up = t;
}
else if (ttype == LLM_TENSOR_FFN_GATE_EXPS) {
experts[il].gate = t;
}
else if (ttype == LLM_TENSOR_FFN_DOWN_EXPS) {
experts[il].down = t;
}
}
}
else if (ttype == LLM_TENSOR_FFN_UP_SHEXP || ttype == LLM_TENSOR_FFN_GATE_SHEXP || ttype == LLM_TENSOR_FFN_DOWN_SHEXP) {
ffn_shexp[il] += t->ne[1] * n_ubatch * sizeof(float);
}
else if (ttype == LLM_TENSOR_FFN_UP || ttype == LLM_TENSOR_FFN_GATE || ttype == LLM_TENSOR_FFN_DOWN) {
ffn_dense[il] += t->ne[1] * n_ubatch * sizeof(float);
}
else if (ttype == LLM_TENSOR_ATTN_Q || ttype == LLM_TENSOR_ATTN_K || ttype == LLM_TENSOR_ATTN_V) {
attn[il] += t->ne[1] * n_ubatch * sizeof(float);
}
else if (ttype == LLM_TENSOR_ATTN_OUT) {
attn[il] += t->ne[0] * n_ubatch * sizeof(float);
}
else if (ttype == LLM_TENSOR_ATTN_NORM || ttype == LLM_TENSOR_ATTN_NORM_2 || ttype == LLM_TENSOR_ATTN_OUT_NORM || ttype == LLM_TENSOR_ATTN_POST_NORM ||
ttype == LLM_TENSOR_FFN_NORM || ttype == LLM_TENSOR_FFN_POST_NORM || ttype == LLM_TENSOR_LAYER_OUT_NORM) {
has_layer_norm[il] = true;
}
}
}
for (int il = 0; il < n_layer; ++il) {
auto ffn = std::max(std::max(ffn_exps[il], ffn_shexp[il]), ffn_dense[il]);
auto inp = model.hparams.n_embd * n_ubatch * sizeof(float);
if (has_layer_norm[il]) inp *= 2;
ffn += inp;
compute[il] = std::max<double>(compute[il], ffn);
}
if (has_mla) {
for (int il = 0; il < n_layer; ++il) {
auto & mla = mla_tensors[il];
if (mla.wk_b && mla.wv_b && !mla.wkv_b) {
auto type = ggml_is_quantized(mla.wk_b->type) ? GGML_TYPE_Q8_0 : mla.wk_b->type;
auto wkv_b_size = ggml_row_size(type, mla.wv_b->ne[0]) * (mla.wv_b->ne[1] + mla.wk_b->ne[1]) * mla.wv_b->ne[2] * 2;
result[il] += wkv_b_size;
}
else if (mla.wkv_b) {
if (!mla.wk_b) result[il] += ggml_nbytes(mla.wkv_b)/2;
if (!mla.wv_b) result[il] += ggml_nbytes(mla.wkv_b)/2;
}
if (mla_attn == 3 && mla.wv_b) {
auto kv_f32_size = (2 * mla.wv_b->ne[1] * mla.wv_b->ne[2] * max_ctx_size * sizeof(float))/(1024.*1024.);
int n_head = mla.wv_b->ne[2];
int n_max_head = n_head;
if (amb > 0 && kv_f32_size > amb) {
n_max_head = 1;
for (int niter = 2; niter < n_head; ++niter) {
if (n_head % niter == 0 && kv_f32_size/niter <= amb) {
n_max_head = n_head/niter;
break;
}
}
}
kv_f32_size = 2. * mla.wv_b->ne[1] * n_max_head * max_ctx_size * sizeof(float);
compute[il] = std::max(compute[il], kv_f32_size);
}
}
}
if (!ow_size) ow_size = embd_size;
result[n_layer] = ow_size + output_misc_size;
LLAMA_LOG_INFO("------------------- Layer sizes:\n");
double tot_model = 0, tot_cache = 0, max_compute = 0;
for (int il = 0; il < n_layer; ++il) {
auto kv_size = model.cache_size(il, cache_type_k, cache_type_v, max_ctx_size, mla_attn, n_seq_max, flash_attn);
LLAMA_LOG_INFO("Layer %2d: %9.2f, %9.2f, %9.2f %9.2f MiB\n", il, result[il]/1024./1024., kv_size/1024./1024., (result[il] + kv_size)/1024./1024., compute[il]/1024./1024.);
max_compute = std::max(max_compute, compute[il]);
tot_model += result[il];
tot_cache += kv_size;
result[il] += kv_size;
}
int n_output = worst_case_tokens > 0 ? worst_case_tokens : n_ubatch;
size_t output_size = model.hparams.n_vocab * n_output * sizeof(float);
if (output_size < max_compute) output_size = max_compute;
output_size -= max_compute;
LLAMA_LOG_INFO("Layer %2d: %9.2f, %9.2f, %9.2f MiB (output layer)\n", n_layer, result[n_layer]/1024./1024., output_size/1024./1024., (result[n_layer] + output_size)/1024./1024.);
result[n_layer] += output_size;
tot_cache += output_size;
LLAMA_LOG_INFO("--------------------------------------------------------------------------\n");
LLAMA_LOG_INFO("Total : %9.2f, %9.2f, %9.2f MiB\n", tot_model/1024./1024., tot_cache/1024./1024., (tot_model + tot_cache)/1024./1024.);
return std::make_pair(std::move(result), max_compute);
}
// Decides where every layer of `model` is placed (CPU or one of model.devices), creates the
// tensors, allocates/maps the backend buffers and - unless `dry_run` is set - loads the data.
//
// Parameters (only the non-obvious ones):
//   n_gpu_layers      number of trailing layers to offload; forced to 999 when `fit` is set
//   mla_attn          in/out: switched to 3 for MLA models that are only partially offloaded
//   split_mode        requested split mode; falls back to 'layer' when 'graph'/'attn' is unsupported
//   tensor_split      optional per-device proportions; nullptr or all zeros => derive from free memory
//   fit_margin        per-device safety margin in MiB (<= 0 => 1 GiB default)
//   fit_margin_array  optional list of (gpu index, margin in MiB) pairs, terminated by a negative gpu index
//   fit               auto-fit: move MoE expert tensors (or whole layers) to the CPU until the model fits
//   dry_run           create tensors and buffers but skip loading the tensor data
//
// Returns true on success and false when ml.load_all_data() reports failure.
// Throws std::runtime_error for unsatisfiable configurations (e.g. auto-fit impossible,
// buffer allocation failure, bad tensors found by validation).
static bool llm_load_tensors(
        llama_model_loader & ml,
        llama_model & model,
        int n_gpu_layers,
        int & mla_attn,
        enum llama_split_mode split_mode,
        int main_gpu,
        int max_gpu,
        const float * tensor_split,
        ggml_type cache_type_k,
        ggml_type cache_type_v,
        ggml_type extra_output_type,
        uint32_t max_ctx_size,
        int n_seq_max,
        int n_ubatch,
        int amb,
        int fit_margin,
        const int * fit_margin_array,
        int worst_case_tokens,
        bool flash_attn,
        bool use_mlock,
        bool validate_quants,
        bool mtp,
        bool fit,
        bool dry_run,
        llama_progress_callback progress_callback,
        void * progress_callback_user_data) {
    model.t_start_us = ggml_time_us();

    constexpr size_t k_default_mem_margin = 1024 * 1024 * 1024;

    auto & hparams = model.hparams;

    // Split modes 'graph' and 'attn' are not available for every architecture: fall back to 'layer'
    if (split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) {
        const bool unsupported_gemma_split =
            model.arch == LLM_ARCH_GEMMA4_MTP ||
            model.arch == LLM_ARCH_GEMMA4_ASSISTANT ||
            (model.arch == LLM_ARCH_GEMMA4 && hparams.n_embd_per_layer > 0);
        if (unsupported_gemma_split) {
            LLAMA_LOG_WARN("\n=========================================================\n");
            LLAMA_LOG_WARN("Split mode 'graph' is not supported for %s\n",
                    (model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) ? "Gemma 4 MTP assistant"
                    : "this Gemma4 variant");
            LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
            LLAMA_LOG_WARN("===========================================================\n\n");
            split_mode = LLAMA_SPLIT_MODE_LAYER;
        } else if (!is_model_split_supported(model)) {
            LLAMA_LOG_WARN("\n=======================================================\n");
            LLAMA_LOG_WARN("Split mode 'graph' is not supported for this model\n");
            LLAMA_LOG_WARN(" => changing split mode to 'layer'\n");
            LLAMA_LOG_WARN("=======================================================\n\n");
            split_mode = LLAMA_SPLIT_MODE_LAYER;
        } else {
            if (model.arch == LLM_ARCH_MIMO2 && model.devices.size() > 4 && (max_gpu == 0 || max_gpu > 4)) {
                LLAMA_LOG_WARN("\n================================================================\n");
                LLAMA_LOG_WARN("Split mode 'graph' for Mimo2 does not work with more than 4 GPUs\n");
                LLAMA_LOG_WARN(" => setting max_gpu to 4\n");
                LLAMA_LOG_WARN("================================================================\n\n");
                max_gpu = 4;
            }
        }
    }

    if (iqk_has_fancy_simd()) {
        LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is defined\n");
    } else {
        LLAMA_LOG_INFO("======================================= HAVE_FANCY_SIMD is NOT defined\n");
    }

    // No explicit context size => size the cache estimate for the training context
    if (max_ctx_size == 0) {
        max_ctx_size = model.hparams.n_ctx_train;
    }

    // Auto-fit computes its own tensor overrides, so it cannot be combined with user supplied ones
    if (fit) {
        if (ml.tensor_buft_overrides) {
            throw std::runtime_error("Manual tensor overrides cannot be used with --fit");
        }
        if (ml.ncmoe > 0) {
            throw std::runtime_error("-ncmoe | --n-cpu-moe cannot be used with --fit");
        }
        n_gpu_layers = 999;
    }

    model.split_mode   = split_mode;
    model.main_gpu     = main_gpu;
    model.max_gpu      = max_gpu;
    model.n_gpu_layers = n_gpu_layers;
    model.mtp          = mtp;

    size_t mem_margin = fit_margin > 0 ? size_t(fit_margin)*1024*1024 : k_default_mem_margin;

    // Returns the safety margin (in bytes) for device `gpu`: the global margin unless
    // fit_margin_array holds a (gpu, MiB) pair for that device. The last matching pair wins.
    auto get_mem_margin = [mem_margin, fit_margin_array, func = __func__] (int gpu) {
        size_t result = mem_margin;
        bool have_margin_override = false;
        if (fit_margin_array) {
            for (int i = 0; ; i += 2) {
                if (fit_margin_array[i] < 0) break;
                if (fit_margin_array[i] == gpu && fit_margin_array[i+1] >= 0) {
                    result = size_t(fit_margin_array[i+1])*1024*1024;
                    have_margin_override = true;
                }
            }
        }
        if (have_margin_override) {
            LLAMA_LOG_INFO("%s: using %0.2f MiB as fit margin for GPU %d\n", func, result/1024./1024., gpu);
        }
        return result;
    };

    const int n_layer = hparams.n_layer;
    int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
    bool use_mmap_buffer = true;

    // The input layer and all layers below i_gpu_start stay on the CPU
    model.buft_input = llama_default_buffer_type_cpu(true);

    model.buft_layer.resize(n_layer);

    for (int i = 0; i < i_gpu_start; ++i) {
        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
    }

    // Usable memory per device = reported device memory minus the safety margin (clamped at 0)
    std::vector<size_t> device_mem(model.devices.size());
    for (int i = 0; i < int(device_mem.size()); ++i) {
        device_mem[i] = llama_get_device_memory(model, model.devices[i]);
        auto this_margin = get_mem_margin(i);
        if (device_mem[i] > this_margin) {
            device_mem[i] -= this_margin;
        } else {
            LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the %zu MiB safety margin\n", device_mem[i]/(1024*1024), model.devices[i], this_margin/(1024*1024));
            device_mem[i] = 0;
        }
    }

    // model.splits holds the normalized cumulative split points across the devices
    if (int device_count = model.devices.size(); device_count > 1) {
        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
        std::vector<float> splits(device_count);
        if (all_zero) {
            // default split: proportional to the usable memory of each device
            for (int i = 0; i < device_count; ++i) {
                splits[i] = device_mem[i];
            }
        } else {
            std::copy(tensor_split, tensor_split + device_count, splits.begin());
        }

        // running sum, then normalize
        float split_sum = 0.0f;
        for (int i = 0; i < device_count; ++i) {
            split_sum += splits[i];
            splits[i] = split_sum;
        }
        for (int i = 0; i < device_count; ++i) {
            splits[i] /= split_sum;
        }
        model.splits = std::move(splits);
    } else {
        model.splits = { 1.0f };
    }

    int device_count = model.splits.size();
    if (device_count < 2 && (model.split_mode == LLAMA_SPLIT_MODE_ATTN || model.split_mode == LLAMA_SPLIT_MODE_GRAPH)) {
        throw std::runtime_error("It is not possible to use split mode 'graph' with less than 2 devices");
    }
    if (fit && device_count > 1) {
        model.main_gpu = device_count - 1;
    }

    if (model.is_mla_model()) {
        if (model.n_gpu_layers > 0 && model.n_gpu_layers < model.hparams.n_layer && mla_attn != 3) {
            LLAMA_LOG_WARN("=============================================================================\n");
            LLAMA_LOG_WARN("MLA models with ngl < n_layer and split mode graph do not work with mla = %d\n", mla_attn);
            LLAMA_LOG_WARN(" => changing mla to 3\n");
            LLAMA_LOG_WARN("=============================================================================\n");
            mla_attn = 3;
        }
    }

    // One entry per repeating layer plus one for the output layer; default is the last device
    model.default_layer_device = std::vector<int32_t>(hparams.n_layer+1, device_count-1);

    int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);

    // Tensor overrides generated by auto-fit. The patterns are strdup()'ed C strings.
    std::vector<llama_model_tensor_buft_override> overrides;

    // Releases the strdup()'ed patterns and detaches them from the loader on EVERY exit path
    // (normal return, `return false`, or an exception). Must be declared after `overrides`
    // so that it is destroyed first.
    struct override_pattern_guard {
        std::vector<llama_model_tensor_buft_override> & overrides;
        llama_model_loader & ml;
        override_pattern_guard(std::vector<llama_model_tensor_buft_override> & o, llama_model_loader & l) : overrides(o), ml(l) {}
        override_pattern_guard(const override_pattern_guard &) = delete;
        override_pattern_guard & operator=(const override_pattern_guard &) = delete;
        ~override_pattern_guard() {
            if (overrides.empty()) return;
            for (auto & o : overrides) {
                if (o.pattern) free(const_cast<char*>(o.pattern));
            }
            ml.tensor_buft_overrides = nullptr;
        }
    } override_guard(overrides, ml);

    if (device_count > 0) {
        std::vector<expert_tensors> experts;
        // layer_sizes[il] = weights + cache of layer il (index n_layer = output layer);
        // max_compute = largest estimated per-layer compute buffer
        auto [layer_sizes, max_compute] = get_layer_sizes(ml, model, cache_type_k, cache_type_v, max_ctx_size, mla_attn, n_seq_max, n_ubatch,
                amb, worst_case_tokens, flash_attn, experts);

        size_t required_mem = 0;
        for (int i = 0; i <= n_layer; ++i) {
            required_mem += layer_sizes[i];
        }

        bool has_experts = false;
        for (int il = 0; il < n_layer; ++il) {
            if (experts[il].down || experts[il].up || experts[il].gate) {
                has_experts = true;
                break;
            }
        }

        // Every device has to reserve room for the compute buffer
        size_t available_mem = 0;
        for (int id = 0; id < device_count; ++id) {
            if (device_mem[id] > max_compute) {
                available_mem += device_mem[id] - max_compute;
            } else {
                LLAMA_LOG_WARN("Free memory %zu MiB on device %d is less the required compute buffer size %g MiB\n", device_mem[id]/(1024*1024), id, max_compute/(1024*1024));
            }
        }

        LLAMA_LOG_INFO("Memory required for model tensors + cache: %.f MiB\n", required_mem/(1024.*1024.));
        LLAMA_LOG_INFO("Memory available on all devices - compute: %.f MiB\n", available_mem/(1024.*1024.));

        // Model does not fit and the user gave no explicit split:
        // recompute the splits from the memory left after the compute buffer
        if (required_mem > available_mem && !tensor_split) {
            float sum = 0;
            for (int id = 0; id < device_count; ++id) {
                device_mem[id] = device_mem[id] > max_compute ? device_mem[id] - max_compute : 0;
                sum += device_mem[id];
                model.splits[id] = sum;
            }
            if (sum > 0) {
                for (int id = 0; id < device_count; ++id) {
                    model.splits[id] /= sum;
                }
            }
        }

        // Assign consecutive layers to devices so that each device receives roughly its split share
        if (device_count > 1) {
            int n_last = n_layer;
            if (n_gpu_layers > n_layer) ++n_last;   // the output layer is offloaded as well
            double sum = max_compute * device_count;
            for (int i = i_gpu_start; i < n_last; ++i) sum += layer_sizes[i];
            int last = i_gpu_start;
            float loaded_sum = 0;
            for (int id = 0; id < device_count; ++id) {
                loaded_sum += max_compute;
                float split_size = model.splits[id]*sum;
                int il = last;
                for (; il < n_last; ++il) {
                    if (loaded_sum + layer_sizes[il] <= split_size) {
                        model.default_layer_device[il] = id;
                        loaded_sum += layer_sizes[il];
                        if (required_mem <= available_mem) {
                            LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
                        }
                    } else {
                        // The layer crosses the split point: keep it on this device only if
                        // the overshoot is smaller than the space that would remain unused
                        if (loaded_sum + layer_sizes[il] - split_size < split_size - loaded_sum) {
                            if (required_mem <= available_mem) {
                                LLAMA_LOG_INFO("Setting default device in layer %2d to %d\n", il, id);
                            }
                            model.default_layer_device[il] = id;
                            loaded_sum += layer_sizes[il++];
                        }
                        break;
                    }
                }
                last = il;
            }
        }

        if (fit && required_mem > available_mem) {
            auto buft = ggml_backend_cpu_buffer_type();
            if (has_experts) {
                if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
                    // Graph/attn split: move expert tensors to the CPU layer by layer (front to back)
                    // until the estimate fits into the total available memory
                    auto cur_mem = required_mem;
                    int n_override = 0;
                    for (int il = 0; il < n_layer; ++il) {
                        bool layer_has_experts = false;
                        if (experts[il].down) {
                            cur_mem -= ggml_nbytes(experts[il].down);
                            layer_has_experts = true;
                        }
                        if (experts[il].up) {
                            cur_mem -= ggml_nbytes(experts[il].up);
                            layer_has_experts = true;
                        }
                        if (experts[il].gate) {
                            cur_mem -= ggml_nbytes(experts[il].gate);
                            layer_has_experts = true;
                        }
                        if (layer_has_experts) {
                            LLAMA_LOG_INFO("Adding experts CPU overrides for layer %d\n", il);
                            std::string pattern = "blk\\." + std::to_string(il) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.(weight|scale))";
                            auto & o = overrides.emplace_back();
                            o.pattern = strdup(pattern.c_str());
                            o.buft = buft;
                            ++n_override;
                        }
                        if (cur_mem <= available_mem - max_compute) {
                            LLAMA_LOG_INFO("Estimated memory use for split mode `graph' is %zu MiB after adding %d overrides, which is less than available memory of %zu MiB\n",
                                    cur_mem/(1024*1024), n_override, size_t(available_mem - max_compute)/(1024*1024));
                            break;
                        }
                    }
                    if (cur_mem > available_mem) {
                        if (n_override > 0) {
                            LLAMA_LOG_ERROR("Required memory %zu MiB is still greater than available memory %zu MiB after overriding all MoE tensors to CPU\n",
                                    cur_mem/(1024*1024), available_mem/(1024*1024));
                        } else {
                            LLAMA_LOG_ERROR("No MoE tensors in model and required memory %zu MiB greater than available memory %zu MiB.\n",
                                    cur_mem/(1024*1024), available_mem/(1024*1024));
                            LLAMA_LOG_ERROR("Note: auto-fit currently only works for MoE models\n");
                        }
                        throw std::runtime_error("Unable to auto-fit model");
                    }
                    // The output layer is not split: reserve its memory on the device it is assigned to
                    int id = model.default_layer_device[n_layer];
                    if (device_mem[id] <= layer_sizes[n_layer]) {
                        LLAMA_LOG_ERROR("Not enough memory in device %d to offload the output layer\n", id);
                        throw std::runtime_error("Unable to auto-fit model");
                    }
                    device_mem[id] -= layer_sizes[n_layer];
                    if (!tensor_split) {
                        float sum = 0;
                        for (int id = 0; id < int(model.splits.size()); ++id) {
                            sum += device_mem[id];
                            model.splits[id] = sum;
                        }
                        if (sum > 0) {
                            LLAMA_LOG_INFO("=== Adjusted device splits for split mode graph:\n");
                            float last_split = 0;
                            for (int id = 0; id < int(model.splits.size()); ++id) {
                                model.splits[id] /= sum;
                                LLAMA_LOG_INFO("Device %2d: %zu MiB -> split = %g\n", id, device_mem[id]/(1024*1024), model.splits[id] - last_split);
                                last_split = model.splits[id];
                            }
                        } else {
                            throw std::runtime_error("Unable to auto-fit model");
                        }
                    }
                } else {
                    // Layer split: handle each device separately, moving the experts of its layers
                    // (last layer first) to the CPU until the device's share fits
                    for (int id = 0; id < device_count; ++id) {
                        size_t mem = 0;
                        for (int il = 0; il <= n_layer; ++il) {
                            if (model.default_layer_device[il] == id) {
                                mem += layer_sizes[il];
                            }
                        }
                        size_t cur_mem = mem;
                        int n_override = 0;
                        if (cur_mem > device_mem[id]) {
                            for (int il = n_layer-1; il >= 0; --il) {
                                if (model.default_layer_device[il] != id) continue;
                                bool layer_has_experts = false;
                                if (experts[il].down) {
                                    cur_mem -= ggml_nbytes(experts[il].down);
                                    layer_has_experts = true;
                                }
                                if (experts[il].up) {
                                    cur_mem -= ggml_nbytes(experts[il].up);
                                    layer_has_experts = true;
                                }
                                if (experts[il].gate) {
                                    cur_mem -= ggml_nbytes(experts[il].gate);
                                    layer_has_experts = true;
                                }
                                if (layer_has_experts) {
                                    LLAMA_LOG_INFO("Adding experts CPU overrides for layer %d in device %d\n", il, id);
                                    std::string pattern = "blk\\." + std::to_string(il) + "\\.(ffn_(up|down|gate|gate_up)_exps\\.(weight|scale))";
                                    auto & o = overrides.emplace_back();
                                    o.pattern = strdup(pattern.c_str());
                                    o.buft = buft;
                                    ++n_override;
                                }
                                if (cur_mem <= device_mem[id]) {
                                    LLAMA_LOG_INFO("Memory use in device %d is %zu MiB after adding %d overrides, which is less than available memory of %zu MiB\n",
                                            id, cur_mem/(1024*1024) ,n_override, device_mem[id]/(1024*1024));
                                    break;
                                }
                            }
                        }
                        if (cur_mem > device_mem[id]) {
                            if (n_override > 0) {
                                LLAMA_LOG_ERROR("Required memory %zu MiB in device %d is still greater than available memory %zu MiB after overriding all MoE tensors to CPU\n",
                                        cur_mem/(1024*1024), id, device_mem[id]/(1024*1024));
                            } else {
                                // report the memory of THIS device: that is what cur_mem was compared against
                                LLAMA_LOG_ERROR("No MoE tensors in layers assigned to device %d, and required memory %zu MiB greater than available memory %zu MiB.\n",
                                        id, cur_mem/(1024*1024), device_mem[id]/(1024*1024));
                            }
                            throw std::runtime_error("Unable to auto-fit model");
                        }
                    }
                }
            } else {
                // No expert tensors to move: fill the devices with whole layers, starting with the
                // output layer on the last device; whatever does not fit anywhere stays on the CPU
                int id = model.devices.size() - 1;
                for (int il = n_layer; il >= 0; --il) {
                    if (device_mem[id] >= layer_sizes[il]) {
                        device_mem[id] -= layer_sizes[il];
                        model.default_layer_device[il] = id;
                        LLAMA_LOG_INFO("Setting layer %d to device %d\n", il, id);
                    } else {
                        --id;
                        if (id >= 0) {
                            ++il;   // retry the same layer on the previous device
                        } else {
                            i_gpu_start = il+1;
                            break;
                        }
                    }
                }
                for (int il = 0; il < i_gpu_start; ++il) {
                    model.default_layer_device[il] = -1;
                    model.buft_layer[il] = llama_default_buffer_type_cpu(true);
                }
            }
        }
    }

    if ((model.arch == LLM_ARCH_GEMMA4_MTP || model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && split_mode == LLAMA_SPLIT_MODE_LAYER && device_count > 0 && n_gpu_layers > 0) {
        const int mtp_device = std::clamp(main_gpu, 0, device_count - 1);
        LLAMA_LOG_INFO("%s: Gemma 4 MTP assistant forcing layer placement to GPU %d under layer split\n",
                __func__, mtp_device);
        for (int i = i_gpu_start; i < n_layer; ++i) {
            model.default_layer_device[i] = mtp_device;
        }
        if (n_gpu_layers > n_layer) {
            model.default_layer_device[n_layer] = mtp_device;
        }
    }

    // Translate the per-layer device assignment into buffer types
    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
        for (int i = i_gpu_start; i < n_layer; ++i) {
            model.buft_layer[i] = llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[i]]);
        }
        if (n_gpu_layers > n_layer) {
            model.buft_output = llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[n_layer]]);
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
        }
    } else {
        ggml_backend_buffer_type_t split_buft;
        if ((split_mode == LLAMA_SPLIT_MODE_GRAPH || split_mode == LLAMA_SPLIT_MODE_ATTN) && model.splits.size() > 1) {
            split_buft = llama_default_buffer_type_split(model, model.devices[main_gpu]);
            model.split_buft = split_buft;
        } else {
            split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
        }
        for (int i = i_gpu_start; i < n_layer; ++i) {
            auto buft_layer = llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[i]]);
            if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
                // 'attn' mode picks the device from the split points rather than from default_layer_device
                int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
                        float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
                model.buft_layer[i] = { split_buft, llama_default_buffer_type_offload(model, model.devices[layer_gpu]) };
                LLAMA_LOG_INFO("Layer %d: assigning buft_layer to GPU %d\n", i, layer_gpu);
            } else {
                model.buft_layer[i] = { split_buft, buft_layer };
            }
        }
        if (n_gpu_layers > n_layer) {
            model.buft_output = {
                split_buft,
                llama_default_buffer_type_offload(model, model.devices[model.default_layer_device[n_layer]])
            };
        } else if (n_gpu_layers > 0) {
            int last_gpu_layer = (int)n_layer - 1;
            int dev = model.default_layer_device.empty() || last_gpu_layer < 0
                ? 0 : std::max(0, model.default_layer_device[last_gpu_layer]);
            model.buft_output = llama_default_buffer_type_offload(model, dev);
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
        }
    }

    // Hand the auto-fit overrides to the loader; the list is terminated by a null pattern
    if (!overrides.empty()) {
        auto & last = overrides.emplace_back();
        last.pattern = nullptr;
        ml.tensor_buft_overrides = overrides.data();
        model.tensor_overrides = true;
    }

    auto cth = create_tensors_helper_interface::instance(ml, model);

    auto ctx_size = cth->get_ctx_size();
    auto & ctx_map = cth->get_ctx_map();

    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);

    if (hparams.n_expert > 0 && hparams.n_expert_used == 0) {
        throw std::runtime_error("model has expert layers but no expert layers are used");
    }

    use_mmap_buffer = cth->create_tensors();

    if (!use_mmap_buffer) {
        ml.use_mmap = false;
    }

    // Deferred expert loading is switched off whenever something else would touch the expert pages anyway
    bool defer_expert_mmap = ml.should_defer_expert_mmaps();
    if (defer_expert_mmap && use_mlock) {
        LLAMA_LOG_WARN("%s: deferred expert loading disabled because mlock keeps mmap ranges resident\n", __func__);
        defer_expert_mmap = false;
    }
    if (defer_expert_mmap && (ml.check_tensors || validate_quants)) {
        LLAMA_LOG_WARN("%s: deferred expert loading disabled because tensor validation would fault expert pages eagerly\n", __func__);
        defer_expert_mmap = false;
    }

    ml.done_getting_tensors();

    ml.init_mappings(!defer_expert_mmap && !dry_run, use_mlock ? &model.mlock_mmaps : nullptr, ml.use_thp);
    model.mappings.reserve(ml.mappings.size());

    // One (context, file index -> buffer) entry per buffer type that actually holds tensors
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
    ctx_bufs.reserve(ctx_map.size());

    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
    model.bufs.reserve(n_max_backend_buffer);

    for (auto & it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx              = it.second;

        llama_buf_map bufs;
        bufs.reserve(n_max_backend_buffer);

        if (ml.use_mmap && use_mmap_buffer && (buft == llama_default_buffer_type_cpu(true) || buft == ggml_backend_cpu_buffer_type())) {
            // CPU tensors with mmap: wrap the mapped range of each file in a CPU buffer
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                void * addr = nullptr;
                size_t first, last;
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                if (first >= last) {
                    continue;
                }
                ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
                if (buf == nullptr) {
                    throw std::runtime_error("unable to allocate backend CPU buffer");
                }
                model.bufs.push_back(buf);
                bufs.emplace(idx, buf);
#ifdef GGML_USE_CUDA
                if (n_layer >= n_gpu_layers) {
                    ggml_backend_cuda_register_host_buffer(
                        ggml_backend_buffer_get_base(buf),
                        ggml_backend_buffer_get_size(buf));
                }
#endif
            }
        }
#ifdef GGML_USE_METAL
        else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
            for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                const size_t max_size = ggml_get_max_tensor_size(ctx);
                void * addr = nullptr;
                size_t first, last;
                ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                if (first >= last) {
                    continue;
                }
                ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size);
                if (buf == nullptr) {
                    throw std::runtime_error("unable to allocate backend metal buffer");
                }
                model.bufs.push_back(buf);
                bufs.emplace(idx, buf);
            }
        }
#endif
        else {
            // Everything else: allocate one backend buffer for all tensors of this context
            int ntensor = 0;
            for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                ++ntensor;
            }
            if (ntensor > 0) {
                ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
                if (buf == nullptr) {
                    LLAMA_LOG_ERROR("Failed to allocate buffer type %s\n", ggml_backend_buft_name(buft));
                    throw std::runtime_error("unable to allocate backend buffer");
                }
                model.bufs.push_back(buf);
                if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                    model.mlock_bufs.emplace_back(new llama_mlock);
                    auto & mlock_buf = model.mlock_bufs.back();
                    mlock_buf->init   (ggml_backend_buffer_get_base(buf));
                    mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
                }
                // the same buffer serves every file
                for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                    bufs.emplace(idx, buf);
                }
            }
        }

        if (bufs.empty()) {
            LLAMA_LOG_WARN("No tensors in buffer type %s\n", ggml_backend_buft_name(buft));
            continue;
        }

        for (auto & buf : bufs) {
            ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        }

        ctx_bufs.emplace_back(ctx, bufs);
    }

    if (llama_supports_gpu_offload()) {
        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

        LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
        if (n_gpu_layers > (int) hparams.n_layer) {
            LLAMA_LOG_INFO("%s: offloading non-repeating layers to GPU\n", __func__);
        }

        const int max_backend_supported_layers = hparams.n_layer + 1;
        const int max_offloadable_layers       = hparams.n_layer + 1;

        LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
    }

    for (ggml_backend_buffer_t buf : model.bufs) {
        LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
    }

    // Index all tensors by name
    for (ggml_context * ctx : model.ctxs) {
        for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
        }
    }

    if (!dry_run) {
        for (auto & it : ctx_bufs) {
            ggml_context * ctx = it.first;
            auto & bufs = it.second;
            if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) {
                return false;
            }
        }
        if (defer_expert_mmap) {
            ml.drop_mmap_expert_pages();
        }
    }

    // Architecture specific post-processing of the loaded tensors
    if (model.is_mla_model()) {
        const bool graph_mode = (model.split_mode == LLAMA_SPLIT_MODE_GRAPH ||
                                 model.split_mode == LLAMA_SPLIT_MODE_ATTN);
        if (!dry_run || graph_mode) {
            llm_prepare_mla(model, mla_attn);
        }
    }

    if (model.arch == LLM_ARCH_GEMMA4) {
        llm_scale_gate_inp_s(model, use_mmap_buffer);
    }

    if ((model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) && extra_output_type != GGML_TYPE_COUNT) {
        llm_requantize_output_tensor(model, extra_output_type);
    }

    // The model takes over the mappings so they outlive the loader
    if (use_mmap_buffer) {
        for (auto & mapping : ml.mappings) {
            model.mappings.emplace_back(std::move(mapping));
        }
    }

    // In-place tensor modification is only done when the data is not backed by the mmap'ed file
    if (!ml.use_mmap) {
        int n_modified = 0;
        for (auto& it : model.tensors_by_name) {
            if (ggml_backend_buffer_is_host(it.second->buffer)) {
                if (iqk_modify_tensor(it.second)) ++n_modified;
            }
        }
        if (n_modified > 0) LLAMA_LOG_INFO("============ Modified %d tensors\n", n_modified);
    }

    if (validate_quants) {
        int nbad = 0;
        for (auto& it : model.tensors_by_name) {
            if (ggml_backend_buffer_is_host(it.second->buffer)) {
                if (!iqk_validate_tensor(it.second)) ++nbad;
            }
        }
        if (nbad > 0) {
            LLAMA_LOG_ERROR("Found %d bad tensors in model\n", nbad);
            throw std::runtime_error("Bad tensors in model");
        }
    }

    if (defer_expert_mmap && !dry_run) {
        LLAMA_LOG_INFO("%s: dense parameters loaded in %.2fs (%.2f GiB), expert parameters deferred (%.2f GiB)\n",
                __func__,
                (ggml_time_us() - model.t_start_us) / 1000000.0,
                ml.expert_tensor_index.dense_bytes / 1024.0 / 1024.0 / 1024.0,
                ml.expert_tensor_index.deferred_bytes / 1024.0 / 1024.0 / 1024.0);
    }

    if (!ml.use_mmap && ml.repack_tensors) {
        int n_repacked = 0;
        for (auto& it : model.tensors_by_name) {
            if (ggml_backend_buffer_is_host(it.second->buffer)) {
                auto orig_type = it.second->type;
                if (it.second->view_src) continue;
                iqk_repack_tensor(it.second);
                if (it.second->type != orig_type) ++n_repacked;
            }
        }
        if (n_repacked > 0) LLAMA_LOG_INFO("============ Repacked %d tensors\n", n_repacked);
    }

    if (model.arch == LLM_ARCH_BITNET) {
        // Copies the scalar scale stored in tensor `s` (1 when `s` is missing) into w->op_params
        auto set_scale = [] (ggml_tensor * w, ggml_tensor * s) {
            if (!s) {
                float one = 1;
                std::memcpy(w->op_params, &one, sizeof(one));
                return;
            }
            float scale = 1;
            if (ggml_backend_buffer_is_host(s->buffer)) {
                scale = *(const float *)s->data;
            } else {
                ggml_backend_tensor_get(s, &scale, 0, sizeof(float));
            }
            std::memcpy(w->op_params, &scale, sizeof(scale));
        };
        for (auto& l : model.layers) {
            set_scale(l.ffn_up,   l.ffn_up_scale);
            set_scale(l.ffn_gate, l.ffn_gate_scale);
            set_scale(l.ffn_down, l.ffn_down_scale);
            set_scale(l.wq, l.wq_scale);
            set_scale(l.wk, l.wk_scale);
            set_scale(l.wv, l.wv_scale);
            set_scale(l.wo, l.wo_scale);
        }
    }

    // The auto-fit override patterns are released by override_guard when the function exits

    model.t_load_us = ggml_time_us() - model.t_start_us;
    return true;
}
// Loads the model file `fname` into `model` as configured by `params`.
//
// Stages, in order: architecture, hyper-parameters, optional expert tensor index,
// vocabulary, metadata print-out, tensors.
//
// Returns  0 on success (also when only the vocabulary was requested),
//         -1 when a std::exception was thrown anywhere during loading (the message is logged),
//         -2 when llm_load_tensors() returned false.
static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
    // Runs one loading stage; a std::exception escaping from it is re-thrown as a
    // std::runtime_error whose message starts with `prefix`.
    const auto run_stage = [](const char * prefix, auto && stage) {
        try {
            stage();
        } catch(const std::exception & e) {
            throw std::runtime_error(prefix + std::string(e.what()));
        }
    };

    try {
        llama_model_loader ml(fname, params.ncmoe, params.use_mmap, params.check_tensors,
                params.repack_tensors, params.use_thp, params.merge_qkv, params.merge_up_gate_exps,
                params.defer_experts,
                params.kv_overrides, params.tensor_buft_overrides);

        model.hparams.vocab_only = params.vocab_only;
        model.mtp = params.mtp;

        run_stage("error loading model architecture: ", [&] {
            llm_load_arch(ml, model);
        });
        run_stage("error loading model hyperparameters: ", [&] {
            llm_load_hparams(ml, model);
        });

        // The expert index needs the hyper-parameters and is only built for mmap'ed loading
        if (params.defer_experts && params.use_mmap) {
#ifdef __linux__
            ml.build_expert_tensor_index(model.hparams);
#else
            LLAMA_LOG_WARN("%s: deferred expert loading is only supported on Linux; ignoring defer_experts\n", __func__);
#endif
        }

        run_stage("error loading model vocabulary: ", [&] {
            LLM_KV kv(model.arch);
            model.vocab.load(ml, kv);
        });

        llm_load_print_meta(ml, model);

        // A model that carries a vocabulary must agree with the vocabulary size in the hyper-parameters
        const bool has_vocab = model.vocab.get_type() != LLAMA_VOCAB_TYPE_NONE;
        if (has_vocab && model.hparams.n_vocab != model.vocab.n_tokens()) {
            throw std::runtime_error("vocab size mismatch");
        }

        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return 0;
        }

#ifdef GGML_USE_KOMPUTE
        if (params.n_gpu_layers > 0) {
            const bool arch_supported =
                model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON;
            const bool ftype_supported =
                model.ftype == LLAMA_FTYPE_ALL_F32 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1;
            if (!arch_supported || !ftype_supported) {
                LLAMA_LOG_WARN("%s: disabling Kompute due to unsupported model arch or quantization\n", __func__);
                params.n_gpu_layers = 0;
            }
        }
#endif

        const bool tensors_loaded = llm_load_tensors(
            ml, model, params.n_gpu_layers, params.mla, params.split_mode, params.main_gpu, params.max_gpu, params.tensor_split,
            params.type_k, params.type_v, params.extra_output_type,
            params.max_ctx_size, params.n_seq_max, params.n_ubatch, params.amb, params.fit_margin, params.fit_margin_array,
            params.worst_graph_tokens, params.flash_attn,
            params.use_mlock, params.validate_quants, params.mtp, params.fit, params.dry_run,
            params.progress_callback, params.progress_callback_user_data);
        if (!tensors_loaded) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
        return -1;
    }

    return 0;
}
// Writes the `delta` field of the first kv_self.size KV cache cells into the
// K-shift input tensor, whose buffer is asserted to be host memory.
static void llama_set_k_shift(llama_context & lctx) {
    assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));

    const int64_t n_cells = lctx.kv_self.size;
    const auto &  cells   = lctx.kv_self.cells;
    int32_t *     shifts  = static_cast<int32_t *>(lctx.inp_K_shift->data);

    for (int idx = 0; idx < n_cells; ++idx) {
        shifts[idx] = cells[idx].delta;
    }
}
// Writes the `src` field of the first kv_self.size KV cache cells into the
// s_copy input tensor, whose buffer is asserted to be host memory.
static void llama_set_s_copy(llama_context & lctx) {
    assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));

    const int64_t n_cells = lctx.kv_self.size;
    int32_t *     sources = static_cast<int32_t *>(lctx.inp_s_copy->data);

    for (int cell = 0; cell < n_cells; ++cell) {
        sources[cell] = lctx.kv_self.cells[cell].src;
    }
}
// Maps the relative position x - y to a bucket index in [0, n_buckets).
//
// With `bidirectional` set, the buckets are split in two halves: positive offsets use the
// upper half, and the magnitude of the offset selects the bucket inside the half.
// Otherwise positive offsets are clamped to 0 and only -(x - y) is bucketed.
// Offsets smaller than max_exact (half of the buckets of one direction) get a bucket of their
// own; larger offsets share logarithmically spaced buckets up to max_distance = 128, and
// everything beyond lands in the last bucket.
//
// NOTE(review): assumes n_buckets is large enough for max_exact to be > 0 (e.g. 32);
// smaller values lead to a division by zero in the logarithmic formula - confirm with callers.
static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
    const int64_t max_distance = 128;

    if (bidirectional) {
        n_buckets >>= 1;
    }

    const int64_t max_exact = n_buckets >> 1;

    int32_t relative_position = x - y;
    int32_t relative_bucket = 0;

    if (bidirectional) {
        relative_bucket += (relative_position > 0) * n_buckets;
        relative_position = abs(relative_position);
    } else {
        relative_position = -std::min<int32_t>(relative_position, 0);
    }

    // Small offsets map one-to-one. Returning here also keeps the logarithmic formula from
    // being evaluated for relative_position == 0, where logf(0) = -inf would be converted
    // to int32_t (undefined behaviour) just to be thrown away.
    if (relative_position < max_exact) {
        return relative_bucket + relative_position;
    }

    int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
    relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);

    return relative_bucket + relative_position_if_large;
}
// Fills the input tensors of the current compute graph from `batch` and from context state.
//
// Every section is guarded by a batch field, an input tensor pointer, or a model/context
// flag. Tensors that are written through their `->data` pointer are asserted to live in a
// host buffer; the remaining ones are uploaded with ggml_backend_tensor_set().
//
//   lctx  - context whose `inp_*` tensors are filled (the recurrent-state section also
//           updates kv cell sequence ids)
//   batch - the micro-batch being evaluated; `pos`, `seq_id` and `n_seq_id` are read
//           without null checks in several sections
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
    const auto & hparams = lctx.model.hparams;
    const auto & cparams = lctx.cparams;
    const auto & kv_self = lctx.kv_self;

    // ---- token ids ----
    if (batch.token && lctx.inp_tokens) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_tokens = batch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(token): %d us\n", int(tim2-tim1));
#endif
    }

    // ---- input embeddings (n_tokens rows of n_embd values) ----
    if (batch.embd) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_embd   = hparams.n_embd;
        const int64_t n_tokens = batch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_embd, batch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(embd): %d us\n", int(tim2-tim1));
#endif
    }

    // ---- positions (4 values per token for MROPE / IMROPE, otherwise 1) ----
    if (batch.pos && lctx.inp_pos) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_tokens = batch.n_tokens;
        const int n_pos_per_embd = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;

        ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(lctx.inp_pos));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(pos): %d us\n", int(tim2-tim1));
#endif
    }

    // ---- per-token attention temperature scale, derived from the token position ----
    if (lctx.inp_pos && lctx.inp_scale) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        int n_tokens = batch.n_tokens;
        GGML_ASSERT(ggml_nelements(lctx.inp_scale) >= n_tokens);
        // scale_data is a scratch buffer kept in the context; it only ever grows
        if (int(lctx.scale_data.size()) < n_tokens) lctx.scale_data.resize(n_tokens);
        int n_pos_per_token = 1;
        for (int i = 0; i < n_tokens; ++i) {
            lctx.scale_data[i] = std::log(std::floor((batch.pos[i] + 1.0f) / hparams.n_attn_temp_floor_scale) + 1.0f) * hparams.f_attn_temp_scale + 1.0f;
        }
        ggml_backend_tensor_set(lctx.inp_scale, lctx.scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(lctx.inp_scale));
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(scale): %d us\n", int(tim2-tim1));
#endif
    }

    // ---- indices of the batch rows that produce an output ----
    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        const int64_t n_tokens = batch.n_tokens;

        if (n_tokens > 1 && !cparams.mtp && lctx.n_outputs < n_tokens) {
            GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        }
        if (lctx.inp_out_ids && lctx.inp_out_ids->buffer) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
            int32_t * data = (int32_t *) lctx.inp_out_ids->data;

            if (lctx.n_outputs == n_tokens) {
                // every row is an output
                for (int i = 0; i < n_tokens; ++i) {
                    data[i] = i;
                }
            } else if (batch.logits) {
                // only the rows flagged in batch.logits
                int32_t n_outputs = 0;
                for (int i = 0; i < n_tokens; ++i) {
                    if (batch.logits[i]) {
                        data[n_outputs++] = i;
                    }
                }
                // the flagged count must agree with what the graph was built for
                GGML_ASSERT(lctx.n_outputs == n_outputs);
            } else if (lctx.n_outputs == 1) {
                // only the last row
                data[0] = n_tokens - 1;
            } else {
                GGML_ASSERT(lctx.n_outputs == 0);
            }
        }
#if IK_PRINT_TIMING == 2
        auto tim2 = ggml_time_us();
        printf("set_inputs(outputs): %d us\n", int(tim2-tim1));
#endif
    }

    GGML_ASSERT(
        (hparams.causal_attn || !cparams.causal_attn) &&
        "causal attention is not supported by this model"
    );

    // ---- attention masks (regular and sliding-window / chunked variant) ----
    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
#if IK_PRINT_TIMING == 2
        auto tim1 = ggml_time_us();
#endif
        if (cparams.causal_attn && !lctx.is_encoding) {
            // Causal case: one mask row per batch token, one column per kv cell.
            // For the GEMMA4 MTP / assistant architectures with a target context attached,
            // the mask is built against the target context's kv cache instead of our own.
            const llama_kv_cache & mask_kv_self =
                ((lctx.model.arch == LLM_ARCH_GEMMA4_MTP || lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && lctx.mtp_target_ctx != nullptr)
                    ? lctx.mtp_target_ctx->kv_self
                    : kv_self;
            const int64_t n_kv     = mask_kv_self.n;
            const int64_t n_tokens = batch.n_tokens;

            // Exactly one of the f32 / f16 pointers is set per existing mask:
            // f16 when flash attention is enabled, f32 otherwise.
            float * data     = nullptr;
            float * data_swa = nullptr;
            ggml_half * data_f16     = nullptr;
            ggml_half * data_swa_f16 = nullptr;

            if (lctx.inp_KQ_mask) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
                if (cparams.flash_attn) {
                    data_f16 = (ggml_half *)lctx.inp_KQ_mask->data;
                } else {
                    data = (float *) lctx.inp_KQ_mask->data;
                }
            }

            if (lctx.inp_KQ_mask_swa) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                if (cparams.flash_attn) {
                    data_swa_f16 = (ggml_half *) lctx.inp_KQ_mask_swa->data;
                } else {
                    data_swa = (float *) lctx.inp_KQ_mask_swa->data;
                }
            }

            // Fast path used when there is no ALiBi and flash attention is on: fills row j
            // for kv cells [first, last) directly with f16 0 / -inf values.
            auto noalibi_f16 = [&mask_kv_self, &hparams, n_kv, data_f16, data_swa_f16] (int j, llama_pos pos, llama_seq_id seq_id, int first, int last) {
                ggml_half h_inf  = ggml_fp32_to_fp16(-INFINITY);
                ggml_half h_zero = ggml_fp32_to_fp16(0.f);
                for (int i = first; i < last; ++i) {
                    // masked if the cell is not part of the token's sequence or lies in its future
                    ggml_half h = !mask_kv_self.cells[i].has_seq_id(seq_id) || mask_kv_self.cells[i].pos > pos ? h_inf : h_zero;
                    if (data_f16) data_f16[j*n_kv + i] = h;
                    if (data_swa_f16) {
                        if (h != h_inf) {
                            if (hparams.n_attn_chunk) {
                                // chunked attention: mask cells before the start of the token's chunk
                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                if (mask_kv_self.cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
                                    h = h_inf;
                                }
                            } else {
                                // sliding window: mask cells at distance >= n_swa
                                if (pos - mask_kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    h = h_inf;
                                }
                            }
                        }
                        data_swa_f16[j*n_kv + i] = h;
                    }
                }
            };

            if (n_kv >= 1024 && n_tokens >= 32) {
                // Large mask: split the kv range into contiguous slices of `npt` cells and
                // fill them on several threads (each thread handles all tokens of its slice).
                int n_thread = std::max(1, int(std::thread::hardware_concurrency()/2));
                int npt = (n_kv + n_thread - 1)/n_thread;
                auto compute = [&batch, &mask_kv_self, &hparams, &cparams, &noalibi_f16, n_tokens, n_kv, npt, data, data_swa, data_f16, data_swa_f16] (int ith) {
                    int first = ith * npt;
                    int last  = std::min(int(n_kv), first + npt);
                    if (last <= first) return;
                    for (int j = 0; j < n_tokens; ++j) {
                        const llama_pos    pos    = batch.pos[j];
                        const llama_seq_id seq_id = batch.seq_id[j][0];

                        if (!hparams.use_alibi && cparams.flash_attn) {
                            noalibi_f16(j, pos, seq_id, first, last);
                            continue;
                        }

                        for (int i = first; i < last; ++i) {
                            float f;
                            if (!mask_kv_self.cells[i].has_seq_id(seq_id) || mask_kv_self.cells[i].pos > pos) {
                                f = -INFINITY;
                            } else {
                                if (hparams.use_alibi) {
                                    // ALiBi: bias is the negative distance between cell and token
                                    f = -std::abs(mask_kv_self.cells[i].pos - pos);
                                } else {
                                    f = 0.0f;
                                }
                            }

                            if (data) {
                                data[j*n_kv + i] = f;
                            }
                            if (data_f16) {
                                data_f16[j*n_kv + i] = ggml_fp32_to_fp16(f);
                            }

                            if (data_swa || data_swa_f16) {
                                if (f > -INFINITY) {
                                    if (hparams.n_attn_chunk) {
                                        llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                        if (mask_kv_self.cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
                                            f = -INFINITY;
                                        }
                                    } else {
                                        if (pos - mask_kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                            f = -INFINITY;
                                        }
                                    }
                                }
                                if (data_swa) {
                                    data_swa[j*n_kv + i] = f;
                                }
                                if (data_swa_f16) {
                                    data_swa_f16[j*n_kv + i] = ggml_fp32_to_fp16(f);
                                }
                            }
                        }
                    }
                };
                // n_thread-1 workers plus the calling thread, which takes the last slice
                std::vector<std::thread> workers(n_thread-1);
                int it = 0;
                for (auto& w : workers) w = std::thread(compute, it++);
                compute(it);
                for (auto& w : workers) w.join();

                // rows between n_tokens and the padded row count are fully masked
                int64_t n_tokens_padded = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
                if (n_tokens_padded > n_tokens) {
                    if (data) {
                        std::fill(data + int64_t(n_tokens)*n_kv, data + n_tokens_padded*n_kv, -INFINITY);
                    }
                    if (data_f16) {
                        ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                        std::fill(data_f16 + int64_t(n_tokens)*n_kv, data_f16 + n_tokens_padded*n_kv, h_inf);
                    }
                    if (data_swa) {
                        std::fill(data_swa + int64_t(n_tokens)*n_kv, data_swa + n_tokens_padded*n_kv, -INFINITY);
                    }
                    if (data_swa_f16) {
                        ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                        std::fill(data_swa_f16 + int64_t(n_tokens)*n_kv, data_swa_f16 + n_tokens_padded*n_kv, h_inf);
                    }
                }
            }
            else {
                // Small mask: same computation on the calling thread over the full kv range.
                for (int h = 0; h < 1; ++h) {
                    for (int j = 0; j < n_tokens; ++j) {
                        const llama_pos    pos    = batch.pos[j];
                        const llama_seq_id seq_id = batch.seq_id[j][0];

                        if (!hparams.use_alibi && cparams.flash_attn) {
                            noalibi_f16(j, pos, seq_id, 0, n_kv);
                            continue;
                        }

                        for (int i = 0; i < n_kv; ++i) {
                            float f;
                            if (!mask_kv_self.cells[i].has_seq_id(seq_id) || mask_kv_self.cells[i].pos > pos) {
                                f = -INFINITY;
                            } else {
                                if (hparams.use_alibi) {
                                    f = -std::abs(mask_kv_self.cells[i].pos - pos);
                                } else {
                                    f = 0.0f;
                                }
                            }

                            if (data) {
                                data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                            }
                            if (data_f16) {
                                data_f16[h*(n_kv*n_tokens) + j*n_kv + i] = ggml_fp32_to_fp16(f);
                            }

                            if (data_swa || data_swa_f16) {
                                if (hparams.n_attn_chunk) {
                                    llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                    if (mask_kv_self.cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
                                        f = -INFINITY;
                                    }
                                } else {
                                    if (pos - mask_kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                        f = -INFINITY;
                                    }
                                }
                                if (data_swa) {
                                    data_swa[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                                }
                                if (data_swa_f16) {
                                    data_swa_f16[h*(n_kv*n_tokens) + j*n_kv + i] = ggml_fp32_to_fp16(f);
                                }
                            }
                        }
                    }

                    // rows between n_tokens and the padded row count are fully masked
                    int64_t n_tokens_padded = GGML_PAD(n_tokens, GGML_KQ_MASK_PAD);
                    if (n_tokens_padded > n_tokens) {
                        if (data) {
                            std::fill(data + int64_t(n_tokens)*n_kv, data + n_tokens_padded*n_kv, -INFINITY);
                        }
                        if (data_f16) {
                            ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                            std::fill(data_f16 + int64_t(n_tokens)*n_kv, data_f16 + n_tokens_padded*n_kv, h_inf);
                        }
                        if (data_swa) {
                            std::fill(data_swa + int64_t(n_tokens)*n_kv, data_swa + n_tokens_padded*n_kv, -INFINITY);
                        }
                        if (data_swa_f16) {
                            ggml_half h_inf = ggml_fp32_to_fp16(-INFINITY);
                            std::fill(data_swa_f16 + int64_t(n_tokens)*n_kv, data_swa_f16 + n_tokens_padded*n_kv, h_inf);
                        }
                    }
                }
            }
#if IK_PRINT_TIMING == 2
            auto tim2 = ggml_time_us();
            printf("set_inputs(mask1): %d us\n", int(tim2-tim1));
#endif
        } else {
            // Non-causal (or encoding) case: tokens attend to other tokens of the same batch.
            const int64_t n_tokens = batch.n_tokens;
            // row stride of the mask: the kv cache length when the model itself is causal
            // and we are not encoding, otherwise the number of tokens
            const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;

            float * data     = nullptr;
            float * data_swa = nullptr;
            ggml_half * data_f16     = nullptr;
            ggml_half * data_swa_f16 = nullptr;

            // inp_KQ_mask may be null here (only the SWA mask present), so it is only
            // dereferenced under this check.
            if (lctx.inp_KQ_mask) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
                if (cparams.flash_attn) {
                    data_f16 = (ggml_half *)lctx.inp_KQ_mask->data;
                } else {
                    data = (float *) lctx.inp_KQ_mask->data;
                }
            }

            if (lctx.inp_KQ_mask_swa) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                if (cparams.flash_attn) {
                    data_swa_f16 = (ggml_half *) lctx.inp_KQ_mask_swa->data;
                } else {
                    data_swa = (float *) lctx.inp_KQ_mask_swa->data;
                }
            }

            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    const llama_seq_id seq_id = batch.seq_id[j][0];

                    for (int i = 0; i < n_tokens; ++i) {
                        // token i is visible to token j if any of i's sequences is j's first sequence
                        float f = -INFINITY;
                        for (int s = 0; s < batch.n_seq_id[i]; ++s) {
                            if (batch.seq_id[i][s] == seq_id) {
                                if (hparams.use_alibi) {
                                    f = -std::abs(batch.pos[i] - batch.pos[j]);
                                } else {
                                    f = 0.0f;
                                }
                                break;
                            }
                        }

                        if (data) {
                            data[h*(n_tokens*n_tokens) + j*n_stride + i] = f;
                        }
                        if (data_f16) {
                            // same row stride as the f32 mask and as the padding loop below
                            data_f16[h*(n_tokens*n_tokens) + j*n_stride + i] = ggml_fp32_to_fp16(f);
                        }

                        // NOTE(review): the SWA masks use a row stride of n_tokens, receive no
                        // row padding, and index kv cells with a batch token index — confirm
                        // this matches the SWA tensor layout when n_stride != n_tokens.
                        if (data_swa || data_swa_f16) {
                            if (hparams.n_attn_chunk) {
                                llama_pos pos_chunk_start = (batch.pos[i] / hparams.n_attn_chunk) * hparams.n_attn_chunk;
                                if (lctx.kv_self.cells[i].pos < pos_chunk_start || batch.pos[i] < pos_chunk_start) {
                                    f = -INFINITY;
                                }
                            } else {
                                if (batch.pos[i] - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    f = -INFINITY;
                                }
                            }
                            if (data_swa) {
                                data_swa[h*(n_tokens*n_tokens) + j*n_tokens + i] = f;
                            }
                            if (data_swa_f16) {
                                data_swa_f16[h*(n_tokens*n_tokens) + j*n_tokens + i] = ggml_fp32_to_fp16(f);
                            }
                        }
                    }

                    // mask the remainder of the row up to the row stride
                    if (data) {
                        for (int i = n_tokens; i < n_stride; ++i) {
                            data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY;
                        }
                    }
                    if (data_f16) {
                        auto h_inf = ggml_fp32_to_fp16(-INFINITY);
                        for (int i = n_tokens; i < n_stride; ++i) {
                            data_f16[h*(n_tokens*n_tokens) + j*n_stride + i] = h_inf;
                        }
                    }
                }
            }
#if IK_PRINT_TIMING == 2
            auto tim2 = ggml_time_us();
            printf("set_inputs(mask2): %d us\n", int(tim2-tim1));
#endif
        }
    }

    // ---- mean pooling: row `seq_id` holds 1/count at the columns of that sequence's tokens ----
    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(lctx.inp_mean);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

        float * data = (float *) lctx.inp_mean->data;
        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

        // number of tokens per sequence (keyed by the first seq_id of each token)
        std::vector<uint64_t> sum(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");

            sum[seq_id] += 1;
        }

        std::vector<float> div(n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            const uint64_t s = sum[i];
            if (s > 0) {
                div[i] = 1.0f/float(s);
            }
        }

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            data[seq_id*n_tokens + i] = div[seq_id];
        }
    }

    // ---- CLS pooling: for each sequence, the batch row of its token at position 0 ----
    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            const llama_pos    pos    = batch.pos[i];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS");

            if (pos == 0) {
                data[seq_id] = i;
            }
        }
    }

    // ---- LAST pooling: for each sequence, the batch row of its token with the highest position ----
    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        std::vector<int> last_pos(n_tokens, -1);
        std::vector<int> last_row(n_tokens, -1);

        for (int i = 0; i < n_tokens; ++i) {
            const llama_seq_id seq_id = batch.seq_id[i][0];
            const llama_pos    pos    = batch.pos[i];

            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");

            // `>=` means a later row wins on equal positions
            if (pos >= last_pos[seq_id]) {
                last_pos[seq_id] = pos;
                last_row[seq_id] = i;
            }
        }

        for (int i = 0; i < n_tokens; ++i) {
            if (last_row[i] >= 0) {
                data[i] = last_row[i];
            }
        }
    }

    // ---- recurrent state inputs ----
    if (kv_self.recurrent) {
        const int64_t n_kv = kv_self.n;

        if (lctx.inp_s_mask) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
            float * data = (float *) lctx.inp_s_mask->data;

            for (int i = 0; i < n_kv; ++i) {
                // the cell index (offset by the cache head) doubles as the sequence id here
                llama_seq_id    seq_id  = i + lctx.kv_self.head;
                llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id];
                bool has_self_seq = kv_cell.has_seq_id(seq_id);

                // 1.0 when the cell already holds its own sequence, 0.0 otherwise
                data[i] = (float) has_self_seq;

                // a used cell (pos >= 0) that lacks its own sequence id gets it added
                if (!has_self_seq && kv_cell.pos >= 0) {
                    kv_cell.seq_id.insert(seq_id);
                }
            }
        }

        if (lctx.inp_s_seq) {
            const int64_t n_tokens = batch.n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer));
            int32_t * data = (int32_t *) lctx.inp_s_seq->data;

            for (int j = 0; j < n_tokens; ++j) {
                const int32_t n_seq = batch.n_seq_id[j];
                GGML_ASSERT(0 < n_seq);

                // row j: the token's sequence ids relative to the cache head, then -1 filler
                for (int i = 0; i < n_kv; ++i) {
                    if (i < n_seq) {
                        data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head;
                    } else {
                        data[j*n_kv + i] = -1;
                    }
                }
            }
        }
    }

    // ---- qnext sequence index input: zero for every token ----
    if (lctx.inp_s_seq_qnext) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq_qnext->buffer));
        int32_t * data = (int32_t *) lctx.inp_s_seq_qnext->data;

        for (int64_t j = 0; j < n_tokens; ++j) {
            data[j] = 0;
        }
    }

    // ---- relative position buckets ----
    if (lctx.inp_pos_bucket) {
        const int64_t n_tokens = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));

        int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;

        if (!lctx.is_encoding) {
            // decoding: buckets between each kv cell and each batch token
            const int64_t n_kv = kv_self.n;
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_kv; ++i) {
                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        } else {
            // encoding: buckets between every pair of batch tokens
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_tokens; ++i) {
                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(batch.pos[i], batch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        }
    }

    // ---- encoder output fed to the decoder ----
    if (!lctx.is_encoding && lctx.inp_embd_enc) {
        assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
        assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());

        ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
    }

    // ---- cross-attention mask: batch tokens (rows) x encoder outputs (columns) ----
    if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
        const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
        const int64_t n_tokens     = batch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));

        float * data = (float *) lctx.inp_KQ_mask_cross->data;

        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                for (int i = 0; i < n_output_enc; ++i) {
                    // visible if encoder output i belongs to any of token j's sequences
                    float f = -INFINITY;
                    for (int s = 0; s < batch.n_seq_id[j]; ++s) {
                        const llama_seq_id seq_id = batch.seq_id[j][s];
                        if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
                            f = 0.0f;
                        }
                    }
                    data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
                }
            }

            // rows between n_tokens and the padded row count are fully masked
            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                for (int j = 0; j < n_output_enc; ++j) {
                    data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
                }
            }
        }
    }
}
static uint32_t llama_output_embd_width(const llama_context & lctx) {
return llama_mtp_state_n_embd(&lctx);
}
// Returns true when MTP is enabled in the context parameters and the model either has
// next-token-prediction layers or is one of the GEMMA4 / GEMMA4_MTP / GEMMA4_ASSISTANT
// architectures.
static bool llama_context_has_mtp_outputs(const llama_context & lctx) {
    if (!lctx.cparams.mtp) {
        return false;
    }

    if (lctx.model.hparams.nextn_predict_layers > 0) {
        return true;
    }

    const auto arch = lctx.model.arch;
    return arch == LLM_ARCH_GEMMA4 ||
           arch == LLM_ARCH_GEMMA4_MTP ||
           arch == LLM_ARCH_GEMMA4_ASSISTANT;
}
// Makes sure the context's output buffer can hold logits and/or embeddings for at least
// `n_outputs` rows, (re)binds the logits / embedding pointers into it and resets the
// output bookkeeping (output_ids are set to -1, n_outputs to 0).
//
// The buffer is sized for max(n_outputs, cparams.n_seq_max) rows and is only reallocated
// when the existing one is too small. This function does not clear the buffer contents.
//
// Returns the number of rows the buffer is sized for, or 0 if the allocation failed.
static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
    const auto & cparams = lctx.cparams;
    const auto & hparams = lctx.model.hparams;

    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);

    const auto n_batch = cparams.n_batch;
    const auto n_vocab = hparams.n_vocab;
    const auto n_embd  = llama_output_embd_width(lctx);

    // MTP needs both logits and embeddings; otherwise logits are produced unless the
    // context is in embeddings mode, and embeddings when encoding or un-pooled.
    const bool has_mtp    = llama_context_has_mtp_outputs(lctx);
    const bool has_logits = !cparams.embeddings || has_mtp;
    const bool has_embd   = lctx.is_encoding || (cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE)) || has_mtp;

    // sizes are in floats
    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ? n_embd*n_outputs_max : 0;

    if (lctx.output_ids.empty()) {
        // one slot per possible batch position
        lctx.output_ids.resize(n_batch);
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
    const size_t new_size  = (logits_size + embd_size) * sizeof(float);

    // allocate on first use, or grow when the existing buffer is too small
    if (!lctx.buf_output || prev_size < new_size) {
        if (lctx.buf_output) {
#ifndef NDEBUG
            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
            ggml_backend_buffer_free(lctx.buf_output);
            lctx.buf_output = nullptr;
            lctx.logits = nullptr;
            lctx.embd = nullptr;
        }

        lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
        if (lctx.buf_output == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
            return 0;
        }
    }

    // logits come first in the buffer, embeddings directly after them
    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output);

    lctx.logits = has_logits ? output_base               : nullptr;
    lctx.embd   = has_embd   ? output_base + logits_size : nullptr;

    lctx.output_size = n_outputs_max;
    lctx.logits_size = logits_size;
    lctx.embd_size   = embd_size;

    // mark every batch position as having no output yet
    std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);

    lctx.n_outputs = 0;

    return n_outputs_max;
}
// Applies `n_threads` (and the abort callback for the CPU backend) to the backends that
// are present in the context, then submits the graph to the scheduler asynchronously.
static void llama_graph_compute(
        llama_context & lctx,
        ggml_cgraph * gf,
        int n_threads) {
#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(lctx.backend_metal)) {
        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
    }
#endif

    const auto cpu_backend = lctx.backend_cpu;
    if (cpu_backend) {
        ggml_backend_cpu_set_n_threads(cpu_backend, n_threads);
        ggml_backend_cpu_set_abort_callback(cpu_backend, lctx.abort_callback, lctx.abort_callback_data);
    }

#ifdef GGML_USE_BLAS
    const auto blas_backend = lctx.backend_blas;
    if (blas_backend) {
        ggml_backend_blas_set_n_threads(blas_backend, n_threads);
    }
#endif

    // asynchronous: this call does not wait for the graph to finish
    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
}
// Uploads the draft hidden state stored in the context into the MTP state input tensor.
//
//   lctx         - context providing `inp_mtp_states` (destination tensor) and
//                  `draft_input_hidden_state` / `draft_input_hidden_state_n_floats` (source)
//   cur_token    - index of the first token of the current micro-batch within the full batch
//   n_tokens     - number of tokens in the current micro-batch
//   n_tokens_all - number of tokens in the full batch
//
// If the source holds exactly as many floats as the destination tensor, it is uploaded
// as is. Otherwise the source is treated as `n_tokens_all` equally sized rows and the
// `n_tokens` rows starting at `cur_token` are uploaded; that slice must match the
// destination size exactly and lie within the source.
//
// Returns true on success; logs an error and returns false on a missing tensor, a missing
// source, or a size mismatch.
static bool prepare_mtp_graph_inputs(
        struct llama_context & lctx,
        uint32_t cur_token,
        uint32_t n_tokens,
        uint32_t n_tokens_all) {
    ggml_tensor * dst = lctx.inp_mtp_states;
    if (dst == nullptr) {
        // ggml_nbytes() below would dereference a null tensor
        LLAMA_LOG_ERROR("%s: MTP state input tensor is null\n", __func__);
        return false;
    }

    const float * src = lctx.draft_input_hidden_state;
    if (src == nullptr) {
        LLAMA_LOG_ERROR("%s: Source hidden state is null\n", __func__);
        return false;
    }

    const size_t dst_bytes       = ggml_nbytes(dst);
    const size_t expected_floats = dst_bytes / sizeof(float);
    const size_t total_floats    = lctx.draft_input_hidden_state_n_floats;

    if (total_floats != expected_floats) {
        // the source must split evenly into one row per token of the full batch
        if (n_tokens_all == 0 || total_floats % n_tokens_all != 0) {
            LLAMA_LOG_ERROR("%s: Source hidden state size mismatch (have %zu floats, need %zu)\n",
                    __func__, total_floats, expected_floats);
            return false;
        }

        const size_t row_width    = total_floats / n_tokens_all;
        const size_t slice_floats = row_width * n_tokens;
        const size_t slice_offset = static_cast<size_t>(cur_token) * row_width;

        // the slice must be exactly the destination size and must not run past the source
        if (slice_floats != expected_floats ||
            slice_offset > total_floats ||
            total_floats - slice_offset < expected_floats) {
            LLAMA_LOG_ERROR("%s: Source hidden state size mismatch (have %zu floats, need %zu)\n",
                    __func__, total_floats, expected_floats);
            return false;
        }

        src += slice_offset;
    }

    ggml_backend_tensor_set(dst, src, 0, dst_bytes);
    return true;
}
static int llama_decode_internal(
llama_context & lctx,
llama_batch batch_all) {
lctx.is_encoding = false;
const uint32_t n_tokens_all = batch_all.n_tokens;
if (n_tokens_all == 0) {
LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__);
return -1;
}
#if IK_PRINT_TIMING > 2
printf("===== %s: %ld\n", __func__, ggml_time_us());
#endif
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & cparams = lctx.cparams;
GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd));
GGML_ASSERT(n_tokens_all <= cparams.n_batch);
GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
if (lctx.t_compute_start_us == 0) {
lctx.t_compute_start_us = ggml_time_us();
}
lctx.n_queued_tokens += n_tokens_all;
auto & kv_self = lctx.kv_self;
const int64_t n_embd = hparams.n_embd;
const int64_t n_vocab = hparams.n_vocab;
uint32_t n_outputs = 0;
uint32_t n_outputs_prev = 0;
uint32_t n_outputs_embd = 0;
uint32_t n_outputs_prev_embd = 0;
const auto n_ubatch = cparams.n_ubatch;
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id *> seq_id_arr;
std::vector<std::vector<llama_seq_id>> seq_id;
const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
const bool has_mtp = llama_context_has_mtp_outputs(lctx);
const uint32_t n_embd_output = llama_output_embd_width(lctx);
if (batch_all.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
n_outputs += batch_all.logits[i] != 0;
}
} else if (lctx.logits_all || embd_pooled) {
n_outputs = n_tokens_all;
} else {
n_outputs = 1;
}
n_outputs_embd = has_mtp && cparams.mtp_op_type == MTP_OP_NONE ? n_tokens_all : n_outputs;
if (llama_output_reserve(lctx, std::max<size_t>(n_outputs, n_outputs_embd)) < std::max<size_t>(n_outputs, n_outputs_embd)) {
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %zu outputs\n", __func__, std::max<size_t>(n_outputs, n_outputs_embd));
return -2;
};
if (batch_all.logits) {
int32_t i_logits = 0;
for (uint32_t i = 0; i < n_tokens_all; ++i) {
if (batch_all.logits[i]) {
lctx.output_ids[i] = i_logits++;
} else {
lctx.output_ids[i] = -1;
}
}
} else if (n_outputs == 1 && n_tokens_all > 0) {
for (uint32_t i = 0; i < n_tokens_all; ++i) {
lctx.output_ids[i] = -1;
}
lctx.output_ids[n_tokens_all - 1] = 0;
} else {
for (uint32_t i = 0; i < std::max<uint32_t>(n_outputs, n_outputs_embd); ++i) {
lctx.output_ids[i] = i;
}
}
bool warned_qnext_mixed_repeat = false;
for (uint32_t cur_token = 0; cur_token < n_tokens_all; ) {
#if IK_PRINT_TIMING
auto tim1 = ggml_time_us();
#endif
uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
if (llm_arch_is_hybrid(model.arch) &&
n_tokens > 1 &&
batch_all.n_seq_id != nullptr &&
batch_all.seq_id != nullptr) {
bool can_check = true;
bool any_diff = false;
bool has_dup = false;
llama_seq_id first_seq_id = 0;
std::unordered_set<llama_seq_id> seen_seq_ids;
seen_seq_ids.reserve(n_tokens);
for (uint32_t i = 0; i < n_tokens; ++i) {
const uint32_t idx = cur_token + i;
if (batch_all.n_seq_id[idx] <= 0 || batch_all.seq_id[idx] == nullptr) {
can_check = false;
break;
}
const llama_seq_id seq_id_i = batch_all.seq_id[idx][0];
if (i == 0) {
first_seq_id = seq_id_i;
} else if (seq_id_i != first_seq_id) {
any_diff = true;
}
if (!seen_seq_ids.insert(seq_id_i).second) {
has_dup = true;
}
}
if (can_check && any_diff && has_dup) {
n_tokens = 1;
if (!warned_qnext_mixed_repeat) {
LLAMA_LOG_WARN("%s: qwen3next mixed-sequence batch contains repeated seq_id values; falling back to single-token chunking\n", __func__);
warned_qnext_mixed_repeat = true;
}
}
}
const uint8_t rope_params_per_token = (hparams.rope_type == LLAMA_ROPE_TYPE_MROPE ||
hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE) ? 4 : 1;
llama_pos * u_batch_pos;
if (!batch_all.pos) {
u_batch_pos = nullptr;
} else if (rope_params_per_token == 1) {
u_batch_pos = batch_all.pos + cur_token;
} else {
pos.resize((size_t) n_tokens * rope_params_per_token);
u_batch_pos = pos.data();
for (uint32_t i = 0; i < n_tokens; ++i) {
if (batch_all.token) {
pos[ i] = batch_all.pos[cur_token + i]; pos[ n_tokens + i] = batch_all.pos[cur_token + i]; pos[2 * n_tokens + i] = batch_all.pos[cur_token + i]; pos[3 * n_tokens + i] = 0;
} else { pos[ i] = batch_all.pos[ cur_token + i]; pos[ n_tokens + i] = batch_all.pos[ n_tokens_all + cur_token + i]; pos[2 * n_tokens + i] = batch_all.pos[2 * n_tokens_all + cur_token + i]; pos[3 * n_tokens + i] = batch_all.pos[3 * n_tokens_all + cur_token + i]; }
}
}
llama_batch u_batch = {
(int32_t) n_tokens,
batch_all.token ? batch_all.token + cur_token : nullptr,
batch_all.embd ? batch_all.embd + cur_token*n_embd : nullptr,
u_batch_pos,
batch_all.n_seq_id ? batch_all.n_seq_id + cur_token : nullptr,
batch_all.seq_id ? batch_all.seq_id + cur_token : nullptr,
batch_all.logits ? batch_all.logits + cur_token : nullptr,
batch_all.all_pos_0 + (llama_pos) cur_token*batch_all.all_pos_1,
batch_all.all_pos_1,
batch_all.all_seq_id,
};
{
int32_t n_outputs_new = 0;
if (u_batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens; i++) {
n_outputs_new += u_batch.logits[i] != 0;
}
} else if (n_outputs == n_tokens_all) {
n_outputs_new = n_tokens;
} else {
if (cur_token + n_tokens >= n_tokens_all) {
n_outputs_new = 1;
}
}
lctx.n_outputs = n_outputs_new;
}
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
GGML_ASSERT(n_threads > 0);
if (u_batch.pos == nullptr) {
pos.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
pos[i] = u_batch.all_pos_0 + i*u_batch.all_pos_1;
}
u_batch.pos = pos.data();
}
if (u_batch.seq_id == nullptr) {
n_seq_id.resize(n_tokens);
seq_id.resize(n_tokens);
seq_id_arr.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
n_seq_id[i] = 1;
seq_id[i].resize(1);
seq_id[i][0] = u_batch.all_seq_id;
seq_id_arr[i] = seq_id[i].data();
}
u_batch.n_seq_id = n_seq_id.data();
u_batch.seq_id = seq_id_arr.data();
}
if (hparams.causal_attn) {
int32_t ret = llama_kv_cache_update(&lctx);
if (ret != 0) {
return ret;
}
if (kv_self.head > kv_self.used + 2*n_tokens) {
kv_self.head = 0;
}
if (!llama_kv_cache_find_slot(kv_self, u_batch)) {
return 1;
}
if (!kv_self.recurrent) {
const uint32_t pad = llama_kv_cache_get_padding(cparams);
auto max_cell = llama_kv_cache_cell_max(kv_self, pad);
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(max_cell, pad)));
}
}
#if IK_PRINT_TIMING
auto tim2 = ggml_time_us();
printf("prelude(...): %d us\n", int(tim2-tim1));
#endif
#if IK_PRINT_TIMING
tim1 = ggml_time_us();
#endif
auto & prev = cparams.mtp_op_type == MTP_OP_NONE ? lctx.prev : lctx.prev_mtp;
ggml_cgraph * gf = nullptr;
if (!lctx.can_reuse_graph(u_batch)) {
lctx.reset_scheduler();
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
#if IK_PRINT_TIMING
tim2 = ggml_time_us();
printf("sched_reset(...): %d us\n", int(tim2-tim1));
#endif
#if IK_PRINT_TIMING
tim1 = ggml_time_us();
#endif
gf = llm_build_context::llama_build_graph(lctx, u_batch, false);
#if IK_PRINT_TIMING
tim2 = ggml_time_us();
printf("build_graph(...): %d us\n", int(tim2-tim1));
#endif
#if IK_PRINT_TIMING
tim1 = ggml_time_us();
#endif
ggml_backend_sched_alloc_graph(lctx.sched, gf);
#if IK_PRINT_TIMING
tim2 = ggml_time_us();
printf("sched_alloc_graph(...): %d us\n", int(tim2-tim1));
#endif
if (u_batch.embd == nullptr && lctx.cparams.graph_reuse &&
!((lctx.model.arch == LLM_ARCH_GEMMA4_MTP || lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && lctx.mtp_target_ctx != nullptr)) {
prev = std::make_unique<llama_context::Prev>(llama_context::Prev{
(int)u_batch.all_seq_id, (int)lctx.n_outputs, (int)lctx.kv_self.n,
(int)u_batch.n_tokens,
lctx.kv_self.save_per_step_ssm, lctx.kv_self.ckpt.per_step_max_allocated,
cparams.mtp_op_type, gf});
}
} else {
gf = prev->graph;
}
if (cparams.mtp_op_type != MTP_OP_NONE) {
if (!prepare_mtp_graph_inputs(lctx, cur_token, n_tokens, n_tokens_all)) {
return GGML_STATUS_FAILED;
}
}
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embd = nullptr;
if (lctx.n_outputs == 0) {
res = nullptr;
}
else {
const bool has_mtp = llama_context_has_mtp_outputs(lctx);
const bool use_raw_mtp_embd = has_mtp && (lctx.model.arch == LLM_ARCH_GEMMA4 ||
lctx.model.arch == LLM_ARCH_GEMMA4_MTP||
lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT);
if (cparams.embeddings || has_mtp) {
for (int i = gf->n_nodes - 1; i >= 0; --i) {
if (use_raw_mtp_embd && strcmp(gf->nodes[i]->name, "result_mtp_embd") == 0) {
embd = gf->nodes[i];
break;
}
if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
embd = gf->nodes[i];
break;
}
if (strcmp(gf->nodes[i]->name, "result_norm") == 0) {
embd = gf->nodes[i];
break;
}
}
}
if (cparams.embeddings && lctx.model.hparams.nextn_predict_layers == 0 && !has_mtp) {
res = nullptr; } else {
if (!embd) { GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
}
}
}
#if IK_PRINT_TIMING == 1
tim1 = ggml_time_us();
#endif
llama_set_inputs(lctx, u_batch);
#if IK_PRINT_TIMING == 1
tim2 = ggml_time_us();
printf("set_inputs(...): %d us\n", int(tim2-tim1));
#endif
#if IK_PRINT_TIMING
tim1 = ggml_time_us();
#endif
llama_graph_compute(lctx, gf, n_threads);
#if IK_PRINT_TIMING
llama_synchronize(&lctx);
tim2 = ggml_time_us();
printf("graph_compute(...): %d us\n", int(tim2-tim1));
#endif
bool reset_previous = false;
{
if (llama_model_has_recurrent(&lctx.model) && kv_self.head == 0) {
reset_previous = true;
}
kv_self.head += n_tokens;
if (kv_self.head >= kv_self.size) {
kv_self.head = 0;
}
}
if (res) {
#if IK_PRINT_TIMING
tim1 = ggml_time_us();
#endif
if (cparams.mtp_op_type != MTP_OP_WARMUP) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
GGML_ASSERT(backend_res != nullptr);
GGML_ASSERT(lctx.logits != nullptr);
float * logits_out = lctx.logits + n_outputs_prev*n_vocab;
const int32_t n_outputs_new = lctx.n_outputs;
if (n_outputs_new) {
GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
if (res->ne[1] == n_tokens && n_outputs_new < n_tokens) {
int32_t i_out = 0;
if (u_batch.logits && !embd_pooled) {
for (uint32_t i = 0; i < n_tokens; i++) {
if (u_batch.logits[i]) {
ggml_backend_tensor_get_async(backend_res, res, logits_out + i_out*n_vocab, i*n_vocab*sizeof(float), n_vocab*sizeof(float));
i_out++;
}
}
} else if (cur_token + n_tokens >= n_tokens_all) {
ggml_backend_tensor_get_async(backend_res, res, logits_out, (n_tokens - 1)*n_vocab*sizeof(float), n_vocab*sizeof(float));
}
} else {
ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
}
}
}
#if IK_PRINT_TIMING
tim2 = ggml_time_us();
printf("get_result(...): %d us\n", int(tim2-tim1));
#endif
}
if (embd && cparams.mtp_op_type != MTP_OP_WARMUP) {
#if IK_PRINT_TIMING
tim1 = ggml_time_us();
#endif
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
GGML_ASSERT(backend_embd != nullptr);
switch (cparams.pooling_type) {
case LLAMA_POOLING_TYPE_NONE:
{
GGML_ASSERT(lctx.embd != nullptr);
float * embd_out = lctx.embd + n_outputs_prev_embd*n_embd_output;
const int32_t n_outputs_new_embd = has_mtp ? embd->ne[1] : lctx.n_outputs;
if (n_outputs_new_embd) {
GGML_ASSERT( n_outputs_prev_embd + n_outputs_new_embd <= n_outputs_embd);
GGML_ASSERT((n_outputs_prev_embd + n_outputs_new_embd)*n_embd_output <= (int64_t) lctx.embd_size);
ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new_embd*n_embd_output*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_MEAN:
case LLAMA_POOLING_TYPE_CLS:
case LLAMA_POOLING_TYPE_LAST:
{
auto & embd_seq_out = lctx.embd_seq;
embd_seq_out.clear();
for (uint32_t i = 0; i < n_tokens; i++) {
const llama_seq_id seq_id = u_batch.seq_id[i][0];
if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
continue;
}
embd_seq_out[seq_id].resize(n_embd);
ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
}
} break;
case LLAMA_POOLING_TYPE_UNSPECIFIED:
{
GGML_ABORT("unknown pooling type");
}
}
#if IK_PRINT_TIMING
tim2 = ggml_time_us();
printf("get_embedding(...): %d us\n", int(tim2-tim1));
#endif
}
n_outputs_prev += lctx.n_outputs;
n_outputs_prev_embd += (has_mtp && embd) ? embd->ne[1] : lctx.n_outputs;
cur_token += n_tokens;
if (reset_previous) {
lctx.prev.reset();
}
if (stop_internal_decode) {
return -3;
}
}
lctx.n_outputs = n_outputs;
lctx.n_outputs_embd = n_outputs_embd;
if (cparams.causal_attn && cparams.defrag_thold >= 0.0f) {
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used)/float(kv_self.n) : 0.0f;
if (fragmentation > cparams.defrag_thold) {
LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
llama_kv_cache_defrag(kv_self);
}
}
#if IK_PRINT_TIMING
auto tim1 = ggml_time_us();
#endif
if (!lctx.prev) {
lctx.reset_scheduler();
}
#if IK_PRINT_TIMING
auto tim2 = ggml_time_us();
printf("sched_reset(...): %d us\n", int(tim2-tim1));
#endif
return 0;
}
// Runs the encoder graph over the whole batch in one pass and copies the
// resulting embeddings out of the backend.
//
// The batch is taken by value because missing `pos` / `seq_id` arrays are
// synthesised locally and patched into the copy.
//
// Returns:
//    0  on success
//   -1  if the batch contains no tokens
//   -2  if the output buffers could not be reserved
static int llama_encode_internal(
        llama_context & lctx,
        llama_batch     batch) {
    lctx.is_encoding = true;

    const uint32_t n_tokens = batch.n_tokens;
    if (n_tokens == 0) {
        LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
        return -1;
    }

    const auto & model   = lctx.model;
    const auto & hparams = model.hparams;
    const auto & cparams = lctx.cparams;

    // exactly one of token ids / raw embeddings must be supplied
    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd));
    // the encoder is evaluated as a single micro-batch
    GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens");

    if (lctx.t_compute_start_us == 0) {
        lctx.t_compute_start_us = ggml_time_us();
    }
    lctx.n_queued_tokens += n_tokens;

    const int64_t n_embd = hparams.n_embd;

    // backing storage for the synthesised batch fields; must outlive `batch`
    std::vector<llama_pos>                 pos;
    std::vector<int32_t>                   n_seq_id;
    std::vector<llama_seq_id *>            seq_id_arr;
    std::vector<std::vector<llama_seq_id>> seq_id;

    if (llama_output_reserve(lctx, n_tokens) < n_tokens) {
        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
        return -2;
    }

    // every token of the batch produces an output, in order
    for (uint32_t i = 0; i < n_tokens; ++i) {
        lctx.output_ids[i] = i;
    }

    lctx.inp_embd_enc = NULL;
    lctx.n_outputs    = n_tokens;

    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
    GGML_ASSERT(n_threads > 0);

    // fill in positions from all_pos_0 / all_pos_1 when the caller gave none
    if (batch.pos == nullptr) {
        pos.resize(n_tokens);
        for (uint32_t i = 0; i < n_tokens; i++) {
            pos[i] = batch.all_pos_0 + i*batch.all_pos_1;
        }
        batch.pos = pos.data();
    }

    // assign every token to all_seq_id when the caller gave no sequence ids
    if (batch.seq_id == nullptr) {
        n_seq_id.resize(n_tokens);
        seq_id.resize(n_tokens);
        seq_id_arr.resize(n_tokens);
        for (uint32_t i = 0; i < n_tokens; i++) {
            n_seq_id[i] = 1;
            seq_id[i].resize(1);
            seq_id[i][0] = batch.all_seq_id;
            seq_id_arr[i] = seq_id[i].data();
        }
        batch.n_seq_id = n_seq_id.data();
        batch.seq_id   = seq_id_arr.data();
    }

    lctx.reset_scheduler();
    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);

    ggml_cgraph * gf = llm_build_context::llama_build_graph(lctx, batch, false);

    // locate the tensor holding the embeddings we need to read back
    struct ggml_tensor * embd = nullptr;
    if (llama_model_has_decoder(&lctx.model)) {
        // encoder-decoder model: the last node must be the normalised encoder output
        embd = gf->nodes[gf->n_nodes - 1];
        GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_norm tensor");
    } else {
        // encoder-only model: the pooled embeddings are the last or second-to-last node
        if (cparams.embeddings) {
            embd = gf->nodes[gf->n_nodes - 1];
            if (strcmp(embd->name, "result_embd_pooled") != 0) {
                embd = gf->nodes[gf->n_nodes - 2];
            }
            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
        }
    }

    ggml_backend_sched_alloc_graph(lctx.sched, gf);
    llama_set_inputs(lctx, batch);
    llama_graph_compute(lctx, gf, n_threads);

    // embeddings are only extracted when no MTP operation is active
    if (embd && cparams.mtp_op_type == MTP_OP_NONE) {
        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
        GGML_ASSERT(backend_embd != nullptr);

        if (llama_model_has_decoder(&lctx.model)) {
            // keep the encoder output and the per-token sequence ids in the context
            lctx.embd_enc.resize(n_tokens*n_embd);
            float * embd_out = lctx.embd_enc.data();
            ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));

            lctx.seq_ids_enc.resize(n_tokens);
            for (uint32_t i = 0; i < n_tokens; i++) {
                for (int s = 0; s < batch.n_seq_id[i]; s++) {
                    llama_seq_id seq_id = batch.seq_id[i][s];
                    lctx.seq_ids_enc[i].insert(seq_id);
                }
            }
        } else {
            GGML_ASSERT(lctx.embd != nullptr);

            switch (cparams.pooling_type) {
                case LLAMA_POOLING_TYPE_NONE:
                    {
                        // one embedding row per token
                        float * embd_out = lctx.embd;
                        GGML_ASSERT(n_tokens*n_embd <= (int64_t) lctx.embd_size);
                        ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_tokens*n_embd*sizeof(float));
                    } break;
                case LLAMA_POOLING_TYPE_MEAN:
                case LLAMA_POOLING_TYPE_CLS:
                case LLAMA_POOLING_TYPE_LAST:
                    {
                        // one embedding row per sequence, read at row index seq_id;
                        // only the first sequence id of each token is considered
                        auto & embd_seq_out = lctx.embd_seq;
                        embd_seq_out.clear();

                        for (uint32_t i = 0; i < n_tokens; i++) {
                            const llama_seq_id seq_id = batch.seq_id[i][0];
                            if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
                                continue;
                            }
                            embd_seq_out[seq_id].resize(n_embd);
                            ggml_backend_tensor_get_async(backend_embd, embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
                        }
                    } break;
                case LLAMA_POOLING_TYPE_UNSPECIFIED:
                    {
                        GGML_ABORT("unknown pooling type");
                    }
            }
        }
    }

    lctx.reset_scheduler();

    return 0;
}
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
auto & kv_self = lctx.kv_self;
const auto & hparams = lctx.model.hparams;
const uint32_t n_layer = hparams.n_layer;
const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
const uint32_t n_used = kv_self.used;
assert(n_used <= n_kv);
uint32_t n_moves = 0;
const uint32_t max_moves = (lctx.model.max_nodes(1) - 2*n_layer)/(6*n_layer);
std::vector<uint32_t> ids(n_kv, n_kv);
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
const auto & cell0 = kv_self.cells[i0];
if (!cell0.is_empty()) {
ids[i0] = i0;
continue;
}
uint32_t nh = 1;
while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
nh++;
}
uint32_t nf = 0;
uint32_t is = n_kv - 1;
for (; is > i0; --is) {
const auto & cell1 = kv_self.cells[is];
if (cell1.is_empty() || ids[is] != n_kv) {
continue;
}
nf++;
if (nf == nh) {
break;
}
}
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
nf = 0;
uint32_t i1 = is;
bool cont = false;
bool stop = false;
for (; i1 < n_kv; ++i1) {
auto & cell1 = kv_self.cells[i1];
if (cell1.is_empty() || ids[i1] != n_kv) {
if (n_moves == max_moves) {
stop = true;
break;
}
cont = false;
continue;
}
ids[i1] = i0 + nf;
kv_self.cells[i0 + nf] = cell1;
cell1 = llama_kv_cell();
kv_self.head = n_used;
if (!cont) {
n_moves++;
cont = true;
}
nf++;
if (nf == nh) {
break;
}
}
if (stop || n_moves == max_moves) {
break;
}
i0 += nh - 1;
}
if (n_moves == 0) {
return;
}
#if 0#else
lctx.reset_scheduler();
ggml_cgraph * gf = llm_build_context::llama_build_graph_defrag(lctx, ids);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
#endif
}
// Reports whether a K-shift may be applied to this context's KV cache.
// Shifting is refused for MLA models and for models using the IMROPE rope type.
static bool get_can_shift(struct llama_context & lctx) {
    if (lctx.model.is_mla_model()) {
        return false;
    }
    return lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_IMROPE;
}
static int32_t llama_kv_cache_update_internal(struct llama_context & lctx) {
bool need_reserve = false;
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
if (!get_can_shift(lctx)) {
return 1;
}
{
lctx.reset_scheduler();
ggml_cgraph * gf = llm_build_context::llama_build_graph_k_shift(lctx);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
llama_set_k_shift(lctx);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
need_reserve = true;
}
{
auto & kv_self = lctx.kv_self;
kv_self.has_shift = false;
for (uint32_t i = 0; i < kv_self.size; ++i) {
kv_self.cells[i].delta = 0;
}
}
}
if ((lctx.kv_self.recurrent || llama_kv_has_qnext_state_storage(lctx.kv_self)) && lctx.kv_self.do_copy) {
{
lctx.reset_scheduler();
ggml_cgraph * gf = llm_build_context::llama_build_graph_s_copy(lctx);
ggml_backend_sched_alloc_graph(lctx.sched, gf);
llama_set_s_copy(lctx);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
need_reserve = true;
}
{
auto & kv_self = lctx.kv_self;
kv_self.do_copy = false;
for (uint32_t i = 0; i < kv_self.size; ++i) {
kv_self.cells[i].src = i;
}
}
}
if (lctx.kv_self.do_defrag) {
llama_kv_cache_defrag_internal(lctx);
need_reserve = true;
lctx.kv_self.do_defrag = false;
}
if (need_reserve) {
int n_tokens = (int)std::min(lctx.cparams.n_ctx, lctx.cparams.n_ubatch);
int n_past = lctx.cparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(&lctx.model); ggml_cgraph * gf = llm_build_context::llama_build_graph(lctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true, lctx.cparams.worst_graph_tokens);
lctx.reset_scheduler();
if (!ggml_backend_sched_reserve(lctx.sched, gf)) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
}
}
return 0;
}
static void llama_lora_adapter_init_internal(struct llama_model * model, const char * path_lora, struct llama_lora_adapter & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
ggml_context * ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
true,
&ctx,
};
struct gguf_context * ctx_gguf = gguf_init_from_file(path_lora, meta_gguf_params);
if (!ctx_gguf) {
throw std::runtime_error("failed to load lora adapter file from " + std::string(path_lora));
}
{
auto get_kv_str = [&](const std::string & key) -> std::string {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
};
auto get_kv_f32 = [&](const std::string & key) -> float {
int id = gguf_find_key(ctx_gguf, key.c_str());
return id < 0 ? 0.0f : gguf_get_val_f32(ctx_gguf, id);
};
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
auto general_type = get_kv_str(llm_kv(LLM_KV_GENERAL_TYPE));
if (general_type != "adapter") {
gguf_free(ctx_gguf);
throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type);
}
auto general_arch_str = get_kv_str(llm_kv(LLM_KV_GENERAL_ARCHITECTURE));
auto general_arch = llm_arch_from_string(general_arch_str);
if (general_arch != model->arch) {
gguf_free(ctx_gguf);
throw std::runtime_error("model arch and LoRA arch mismatch");
}
auto adapter_type = get_kv_str(llm_kv(LLM_KV_ADAPTER_TYPE));
if (adapter_type != "lora") {
gguf_free(ctx_gguf);
throw std::runtime_error("expect adapter.type to be 'lora', but got: " + adapter_type);
}
adapter.alpha = get_kv_f32(llm_kv(LLM_KV_ADAPTER_LORA_ALPHA));
}
int n_tensors = gguf_get_n_tensors(ctx_gguf);
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
auto get_ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
auto it = ctx_map.find(buft);
if (it == ctx_map.end()) {
struct ggml_init_params params = {
n_tensors*ggml_tensor_overhead(),
NULL,
true,
};
ggml_context * buft_ctx = ggml_init(params);
ctx_map[buft] = buft_ctx;
return buft_ctx;
};
return it->second;
};
std::map<std::string, llama_lora_weight> ab_map;
auto str_endswith = [](const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
};
for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
std::string name(cur->name);
if (str_endswith(name, ".lora_a")) {
replace_all(name, ".lora_a", "");
if (ab_map.find(name) == ab_map.end()) {
ab_map[name] = llama_lora_weight(cur, nullptr);
} else {
ab_map[name].a = cur;
}
} else if (str_endswith(name, ".lora_b")) {
replace_all(name, ".lora_b", "");
if (ab_map.find(name) == ab_map.end()) {
ab_map[name] = llama_lora_weight(nullptr, cur);
} else {
ab_map[name].b = cur;
}
} else {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
}
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
if (!w.a || !w.b) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
}
auto * model_tensor = llama_get_model_tensor(model, name.c_str());
if (!model_tensor) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
}
struct ggml_context * dev_ctx = get_ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
}
if (w.a->ne[1] != w.b->ne[0]) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
ggml_set_name(tensor_a, w.a->name);
ggml_set_name(tensor_b, w.b->name);
adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
}
{
adapter.ctxs.reserve(ctx_map.size());
adapter.bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx_dev = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_dev, buft);
if (!buf) {
gguf_free(ctx_gguf);
ggml_free(ctx);
throw std::runtime_error("failed to allocate buffer for lora adapter\n");
}
LLAMA_LOG_INFO("%s: %10s LoRA buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
adapter.ctxs.push_back(ctx_dev);
adapter.bufs.push_back(buf);
}
}
{
llama_file gguf_file(path_lora, "rb");
std::vector<uint8_t> read_buf;
auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
size_t offs = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, gguf_find_tensor(ctx_gguf, orig->name));
size_t size = ggml_nbytes(orig);
read_buf.resize(size);
gguf_file.seek(offs, SEEK_SET);
gguf_file.read_raw(read_buf.data(), size);
ggml_backend_tensor_set(dev, read_buf.data(), 0, size);
};
for (auto & it : adapter.ab_map) {
auto orig = ab_map[it.first];
auto dev = it.second;
set_tensor(orig.a, dev.a);
set_tensor(orig.b, dev.b);
}
}
LLAMA_LOG_INFO("%s: loaded %ld tensors from lora file\n", __func__, adapter.ab_map.size()*2);
gguf_free(ctx_gguf);
ggml_free(ctx);
}
// Attaches `adapter` to the context with the given scale, replacing the scale
// of an already attached adapter.
// Returns 0 on success, -1 when the context has flash attention enabled.
int32_t llama_lora_adapter_set(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter,
            float scale) {
    const bool flash_attn_enabled = ctx->cparams.flash_attn;
    if (!flash_attn_enabled) {
        ctx->lora_adapters[adapter] = scale;
        return 0;
    }

    LLAMA_LOG_ERROR("%s: flash_attn is not compatible with LoRA\n", __func__);
    return -1;
}
// Detaches `adapter` from the context.
// Returns 0 if the adapter was attached and has been removed, -1 otherwise.
int32_t llama_lora_adapter_remove(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter) {
    auto & attached = ctx->lora_adapters;

    const auto entry = attached.find(adapter);
    if (entry == attached.end()) {
        return -1;
    }

    attached.erase(entry);
    return 0;
}
// Detaches every LoRA adapter from the context.
void llama_lora_adapter_clear(struct llama_context * ctx) {
    auto & attached = ctx->lora_adapters;
    attached.clear();
}
// Destroys a LoRA adapter object. Passing a null pointer is a no-op.
void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
    if (adapter != nullptr) {
        delete adapter;
    }
}
// Returns a llama_model_params value populated with the library defaults.
//
// The initializer below is positional: each entry is matched, in order, to
// the corresponding member of struct llama_model_params. Adding, removing or
// reordering members in the struct declaration requires updating this list.
struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
nullptr,
0,
0,
LLAMA_SPLIT_MODE_LAYER,
0,
0,
0,
GGML_TYPE_F16,
GGML_TYPE_F16,
0,
1,
512,
0,
0,
false,
0,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
-1,
-1,
-1,
-1,
GGML_TYPE_COUNT,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
false,
true,
false,
false,
false,
false,
false,
false,
false,
false,
false,
true,
false,
};
// Metal builds override n_gpu_layers with 999.
#ifdef GGML_USE_METAL
result.n_gpu_layers = 999;
#endif
return result;
}
// Returns a llama_context_params value populated with the library defaults.
//
// The initializer below is positional: each entry is matched, in order, to
// the corresponding member of struct llama_context_params. Adding, removing
// or reordering members in the struct declaration requires updating this list.
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
LLAMA_DEFAULT_SEED,
512,
2048,
512,
1,
// two consecutive members share the same default
GGML_DEFAULT_N_THREADS, GGML_DEFAULT_N_THREADS,
256,
0,
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
LLAMA_POOLING_TYPE_UNSPECIFIED,
LLAMA_ATTENTION_TYPE_UNSPECIFIED,
0.0f,
0.0f,
-1.0f,
-1.0f,
-1.0f,
-1.0f,
0,
-1.0f,
nullptr,
nullptr,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
GGML_TYPE_F16,
-1,
-1,
-1,
-1,
false,
false,
true,
true,
3,
0,
true,
false,
true,
true,
false,
true,
-1,
0.0f,
false,
false,
false,
false,
false,
false,
MTP_OP_NONE,
nullptr,
nullptr,
nullptr,
nullptr,
};
return result;
}
// Returns a llama_model_quantize_params value populated with the library
// defaults.
//
// The initializer below is positional: each entry is matched, in order, to
// the corresponding member of struct llama_model_quantize_params. Adding,
// removing or reordering members in the struct declaration requires updating
// this list.
struct llama_model_quantize_params llama_model_quantize_default_params() {
struct llama_model_quantize_params result = {
0,
LLAMA_FTYPE_MOSTLY_Q5_1,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT,
false,
true,
false,
false,
false,
false,
false,
false,
false,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
};
return result;
}
// Returns the maximum number of devices supported by the backend this build
// was compiled for. Builds without a multi-device backend report 1.
size_t llama_max_devices(void) {
    // the first matching backend macro decides the limit
#if defined(GGML_USE_RPC)
    const size_t device_limit = GGML_RPC_MAX_SERVERS;
#elif defined(GGML_USE_METAL)
    const size_t device_limit = 1;
#elif defined(GGML_USE_CUDA)
    const size_t device_limit = GGML_CUDA_MAX_DEVICES;
#elif defined(GGML_USE_SYCL)
    const size_t device_limit = GGML_SYCL_MAX_DEVICES;
#elif defined(GGML_USE_VULKAN)
    const size_t device_limit = GGML_VK_MAX_DEVICES;
#elif defined(GGML_USE_CANN)
    const size_t device_limit = GGML_CANN_MAX_DEVICES;
#else
    const size_t device_limit = 1;
#endif
    return device_limit;
}
// Reports whether memory-mapped file access is available, as advertised by
// llama_mmap::SUPPORTED.
bool llama_supports_mmap(void) {
    const bool mmap_available = llama_mmap::SUPPORTED;
    return mmap_available;
}
// Reports whether memory locking is available, as advertised by
// llama_mlock::SUPPORTED.
bool llama_supports_mlock(void) {
    const bool mlock_available = llama_mlock::SUPPORTED;
    return mlock_available;
}
// Reports whether this build was compiled with a backend that allows layers
// to be offloaded (CUDA, Metal, Vulkan, SYCL, Kompute or RPC).
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
    const bool offload_available = true;
#else
    const bool offload_available = false;
#endif
    return offload_available;
}
void llama_backend_init(void) {
ggml_time_init();
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
}
// Initialises ggml's NUMA support with the given strategy.
// Does nothing when the strategy is GGML_NUMA_STRATEGY_DISABLED.
void llama_numa_init(enum ggml_numa_strategy numa) {
    if (numa == GGML_NUMA_STRATEGY_DISABLED) {
        return;
    }
    ggml_numa_init(numa);
}
// Backend teardown counterpart of llama_backend_init.
// Currently this only forwards to ggml_quantize_free().
void llama_backend_free(void) {
    ggml_quantize_free();
    return;
}
// Returns the current ggml timer value in microseconds.
int64_t llama_time_us(void) {
    const int64_t now_us = ggml_time_us();
    return now_us;
}
// Builds the buffer/device name used for an RPC device, in the form
// "RPC<device>[<endpoint>]", e.g. "RPC0[host:50052]".
static std::string create_rpc_name(std::string endpoint, uint32_t device) {
    std::string label("RPC");
    label += std::to_string(device);
    label += "[";
    label += endpoint;
    label += "]";
    return label;
}
// Creates a llama_model and loads it from the file at `path_model`.
//
// Before loading, the set of devices to use is resolved:
//   * every offload buffer type of the model and every configured RPC server
//     is registered under its name,
//   * the names requested in params.devices (comma separated) are used if
//     given, otherwise all RPC devices followed by all local devices,
//   * unknown names are reported and skipped; with no usable device the
//     number of GPU layers is forced to 0.
//
// Returns the loaded model, or nullptr if loading failed or was cancelled.
struct llama_model * llama_model_load_from_file(
        const char * path_model,
        struct llama_model_params params) {
    ggml_time_init();

    llama_model * model = new llama_model;

    // Without a user callback, print a dot each time the integer progress
    // percentage advances. The counter lives on this stack frame, which
    // outlives the load below.
    unsigned cur_percentage = 0;
    if (params.progress_callback == NULL) {
        params.progress_callback_user_data = &cur_percentage;
        params.progress_callback = [](float progress, void * ctx) {
            unsigned & last_reported = *static_cast<unsigned *>(ctx);
            const unsigned percentage = (unsigned) (100 * progress);
            if (percentage > last_reported) {
                last_reported = percentage;
                LLAMA_LOG_INFO(".");
                if (percentage >= 100) {
                    LLAMA_LOG_INFO("\n");
                }
            }
            return true;
        };
    }

    model->set_tensor_overrides(params);

    // devices explicitly requested by the caller, if any
    std::vector<std::string> requested_devices;
    if (params.devices && !striequals(params.devices, "")) {
        requested_devices = string_split(params.devices, ",");
    }

    // name -> device index for everything that can be selected
    std::map<std::string, int32_t> index_by_name;
    std::vector<std::string> gpu_names;

    const bool has_rpc = params.rpc_servers != nullptr && params.rpc_servers[0] != '\0';

    // local devices occupy indices [0, dev_count)
    int32_t idx = 0;
    const int dev_count = (int)llama_get_device_count(*model);
    for (; idx < dev_count; ++idx) {
        ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, idx);
        const std::string buft_name(ggml_backend_buft_name(buft));
        index_by_name.insert({ buft_name, idx });
        gpu_names.push_back(buft_name);
    }

    // RPC devices continue the numbering after the local devices
    if (has_rpc) {
        model->rpc_servers = extract_device_from_rpc_device(string_split(params.rpc_servers, ","));
        for (const auto & rpc : model->rpc_servers) {
            index_by_name.insert({ create_rpc_name(rpc.endpoint, rpc.device), idx });
            ++idx;
        }
    }

    // decide which device names to use
    std::vector<std::string> device_names;
    if (!requested_devices.empty()) {
        device_names = requested_devices;
    } else {
        if (has_rpc) {
            for (const auto & rpc : model->rpc_servers) {
                device_names.push_back(create_rpc_name(rpc.endpoint, rpc.device));
            }
        }
        device_names.insert(device_names.end(), gpu_names.begin(), gpu_names.end());
    }

    // translate names to device indices, reporting unknown ones
    for (const auto & device : device_names) {
        const auto known = index_by_name.find(device);
        if (known == index_by_name.end()) {
            LLAMA_LOG_ERROR("%s backend not available.\n", device.c_str());
            continue;
        }
        model->devices.push_back(known->second);
    }

    if (model->devices.empty()) {
        // nothing to offload to
        params.n_gpu_layers = 0;
        LLAMA_LOG_INFO("CPU: using device CPU\n");
    } else {
        for (auto dev : model->devices) {
            ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, dev);
            const char * name = ggml_backend_buft_name(buft);
            // the buffer type name doubles as the description
            const char * description = name;
            const size_t free_bytes = llama_get_device_memory(*model, dev);
            LLAMA_LOG_INFO("%s: using device %s - %zu MiB free\n",
                name, description,
                free_bytes / 1024 / 1024);
        }
    }

    const int status = llama_model_load(path_model, *model, params);
    GGML_ASSERT(status <= 0);
    if (status == 0) {
        return model;
    }

    // negative status: -1 is a load failure, -2 a cancelled load
    if (status == -1) {
        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
    } else if (status == -2) {
        LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
    }
    delete model;
    return nullptr;
}
// Destroys a model object. Passing a null pointer is a no-op.
void llama_free_model(struct llama_model * model) {
    if (model != nullptr) {
        delete model;
    }
}
// Fills the merged up/gate expert tensors of every layer that has them from
// the separate `ffn_up_exps` / `ffn_gate_exps` tensors (and likewise for the
// biases, when all three bias tensors exist).
//
// A layer is processed when it has all three weight tensors and the merged
// tensor carries no `extra`. Nothing is done when an MTP operation is active.
//
// Layout written to the merged tensor: for each slice, the gate data first,
// then the up data.
static void llama_repack_up_gate_exps(llama_context & lctx) {
    if (lctx.cparams.mtp_op_type != MTP_OP_NONE) {
        return;
    }

    auto & model = lctx.model;

    const auto wants_repack = [](const auto & layer) -> bool {
        return layer.ffn_up_gate_exps && layer.ffn_up_exps && layer.ffn_gate_exps &&
              !layer.ffn_up_gate_exps->extra;
    };
    if (!std::any_of(model.layers.begin(), model.layers.end(), wants_repack)) {
        return;
    }

    // host staging buffers, reused (and only ever grown) across layers
    std::vector<char> up_data, gate_data, merged;

    const auto grow_to = [](std::vector<char> & buf, size_t n_bytes) {
        if (buf.size() < n_bytes) {
            buf.resize(n_bytes);
        }
    };

    // merged = [gate chunk 0][up chunk 0][gate chunk 1][up chunk 1]...
    const auto interleave = [&](size_t chunk, int n_chunks) {
        size_t dst_offset = 0;
        size_t src_offset = 0;
        for (int i = 0; i < n_chunks; ++i) {
            std::memcpy(merged.data() + dst_offset, gate_data.data() + src_offset, chunk);
            dst_offset += chunk;
            std::memcpy(merged.data() + dst_offset, up_data.data() + src_offset, chunk);
            dst_offset += chunk;
            src_offset += chunk;
        }
    };

    for (int il = 0; il < int(model.layers.size()); ++il) {
        auto & l = model.layers[il];
        if (!wants_repack(l)) {
            continue;
        }

        // --- sanity checks on types and shapes ----------------------------
        GGML_ASSERT(l.ffn_up_exps->type == l.ffn_gate_exps->type);
        if (l.ffn_up_gate_exps->type != l.ffn_up_exps->type) {
            // a differing merged type must be the interleaved variant of the source type
            auto [other_type, _] = interleaved_properties(l.ffn_up_gate_exps->type);
            GGML_ASSERT(other_type == l.ffn_up_exps->type);
        }
        GGML_ASSERT(l.ffn_up_gate_exps->ne[0] == l.ffn_up_exps->ne[0] && l.ffn_up_gate_exps->ne[0] == l.ffn_gate_exps->ne[0]);
        GGML_ASSERT(l.ffn_up_gate_exps->ne[2] == l.ffn_up_exps->ne[2] && l.ffn_up_gate_exps->ne[2] == l.ffn_gate_exps->ne[2]);
        GGML_ASSERT(l.ffn_up_gate_exps->ne[1] == l.ffn_up_exps->ne[1] + l.ffn_gate_exps->ne[1]);

        // --- weights ---------------------------------------------------------
        auto nbytes = ggml_nbytes(l.ffn_up_exps);
        GGML_ASSERT(nbytes == ggml_nbytes(l.ffn_gate_exps));
        grow_to(up_data,   nbytes);
        grow_to(gate_data, nbytes);

        LLAMA_LOG_INFO("%s: repacking up/gate experts weight in layer %d\n", __func__, il);
        ggml_backend_tensor_get(l.ffn_up_exps,   up_data.data(),   0, nbytes);
        ggml_backend_tensor_get(l.ffn_gate_exps, gate_data.data(), 0, nbytes);
        grow_to(merged, 2*nbytes);

        // bytes of one slice along dimension 2 of the up tensor
        auto expert_size = l.ffn_up_exps->ne[1]*l.ffn_up_exps->nb[1];
        interleave(expert_size, (int)l.ffn_up_gate_exps->ne[2]);
        ggml_backend_tensor_set(l.ffn_up_gate_exps, merged.data(), 0, 2*expert_size*l.ffn_up_gate_exps->ne[2]);

        // --- biases (optional) -----------------------------------------------
        if (!(l.ffn_up_gate_exps_b && l.ffn_up_exps_b && l.ffn_gate_exps_b)) {
            continue;
        }

        nbytes = ggml_nbytes(l.ffn_up_exps_b);
        GGML_ASSERT(nbytes == ggml_nbytes(l.ffn_gate_exps_b));
        grow_to(up_data,   nbytes);
        grow_to(gate_data, nbytes);

        LLAMA_LOG_INFO("%s: repacking up/gate experts bias in layer %d\n", __func__, il);
        ggml_backend_tensor_get(l.ffn_up_exps_b,   up_data.data(),   0, nbytes);
        ggml_backend_tensor_get(l.ffn_gate_exps_b, gate_data.data(), 0, nbytes);
        grow_to(merged, 2*nbytes);

        // bytes of one row of the up bias tensor
        expert_size = l.ffn_up_exps_b->nb[1];
        interleave(expert_size, (int)l.ffn_up_gate_exps_b->ne[1]);
        ggml_backend_tensor_set(l.ffn_up_gate_exps_b, merged.data(), 0, 2*expert_size*l.ffn_up_gate_exps_b->ne[1]);
    }
}
struct llama_context * llama_init_from_model(
struct llama_model * model,
struct llama_context_params params) {
if (!model) {
LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
return nullptr;
}
if (params.n_batch == 0 && params.n_ubatch == 0) {
LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
return nullptr;
}
if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
return nullptr;
}
if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
params.flash_attn = false;
}
if (params.type_v != GGML_TYPE_F16 && params.type_v != GGML_TYPE_BF16 && !params.flash_attn) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr;
}
if (params.k_cache_hadamard && !ggml_is_quantized(params.type_k)) {
LLAMA_LOG_WARN("%s: there is no point in Hadamard transforms with not quantized K-cache. Turning K-cache Hadamard off\n", __func__);
params.k_cache_hadamard = false;
}
if (params.k_cache_hadamard) {
int nok = 0;
for (int il = 0; il < model->hparams.n_layer; ++il) {
if (model->hadamard_size_k(il) > 0) ++nok;
}
if (nok == 0) {
LLAMA_LOG_WARN("%s: no layer allows a K-head Hadamard transform. Turning K-cache Hadamard off\n", __func__);
params.k_cache_hadamard = false;
}
}
if (params.v_cache_hadamard && !ggml_is_quantized(params.type_v)) {
LLAMA_LOG_WARN("%s: there is no point in Hadamard transforms with not quantized V-cache. Turning V-cache Hadamard off\n", __func__);
params.v_cache_hadamard = false;
}
if (params.v_cache_hadamard) {
int nok = 0;
for (int il = 0; il < model->hparams.n_layer; ++il) {
if (model->hadamard_size_v(il) > 0) ++nok;
}
if (nok == 0) {
LLAMA_LOG_WARN("%s: no layer allows a V-head Hadamard transform. Turning V-cache Hadamard off\n", __func__);
params.v_cache_hadamard = false;
}
}
llama_context * ctx = new llama_context(*model);
for (int i : model->devices) {
ggml_backend_buffer_type_t buft = llama_default_buffer_type_offload(*model, i);
const char* name = ggml_backend_buft_name(buft);
std::string device(name);
ctx->cparams.devices.push_back(device);
}
const auto & hparams = model->hparams;
auto & cparams = ctx->cparams;
cparams.n_seq_max = std::max(1u, params.n_seq_max);
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch;
cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.flash_attn = params.flash_attn;
cparams.mla_attn = params.mla_attn;
cparams.attn_max_batch = params.attn_max_batch;
cparams.fused_moe_up_gate= params.fused_moe_up_gate;
cparams.grouped_expert_routing = params.grouped_expert_routing;
cparams.fused_up_gate = params.fused_up_gate;
cparams.fused_mmad = params.fused_mmad;
cparams.rope_cache = params.rope_cache;
cparams.graph_reuse = params.graph_reuse;
cparams.k_cache_hadamard = params.k_cache_hadamard;
cparams.v_cache_hadamard = params.v_cache_hadamard;
if (model->khad_pretransformed && !cparams.k_cache_hadamard) {
LLAMA_LOG_WARN("%s: model has Hadamard-folded wv_b/wk_b_pp; forcing k_cache_hadamard=true\n", __func__);
cparams.k_cache_hadamard = true;
}
if (cparams.k_cache_hadamard) {
llm_apply_khad_pretransform(*model);
}
cparams.split_mode_graph_scheduling = params.split_mode_graph_scheduling;
cparams.scheduler_async = params.scheduler_async;
cparams.min_experts = params.min_experts;
cparams.thresh_experts = params.thresh_experts;
cparams.cuda_params = params.cuda_params;
cparams.mtp = params.mtp;
cparams.worst_graph_tokens = params.worst_case_tokens;
cparams.reduce_type = params.type_reduce;
cparams.graph_attn_precision = params.type_graph_attn;
if (cparams.graph_attn_precision != GGML_TYPE_F16 && cparams.graph_attn_precision != GGML_TYPE_F32) {
throw std::runtime_error(format("--graph-attn-precision must be f16 or f32, got %s",
ggml_type_name(cparams.graph_attn_precision)));
}
cparams.pooling_type = params.pooling_type;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));
cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
if (cparams.n_batch < GGML_KQ_MASK_PAD) {
LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
cparams.n_batch = GGML_KQ_MASK_PAD;
}
cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
hparams.n_ctx_train;
cparams.cb_eval = params.cb_eval;
cparams.cb_eval_user_data = params.cb_eval_user_data;
auto rope_scaling_type = params.rope_scaling_type;
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
rope_scaling_type = hparams.rope_scaling_type_train;
}
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
cparams.rope_freq_scale = 1.0f; }
if (cparams.yarn_ext_factor < 0.0f) { cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
}
cparams.yarn_attn_factor *= hparams.rope_attn_factor;
if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
} else {
cparams.pooling_type = hparams.pooling_type;
}
}
if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
cparams.causal_attn = hparams.causal_attn;
} else {
cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
}
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
if (!model->is_mla_model() && cparams.mla_attn != 0) {
cparams.mla_attn = 0;
} else {
if (model->n_gpu_layers > 0 && model->n_gpu_layers < model->hparams.n_layer && cparams.mla_attn != 3) {
LLAMA_LOG_WARN("=============================================================================\n");
LLAMA_LOG_WARN("MLA models with ngl < n_layer and split mode graph do not work with mla = %d\n", cparams.mla_attn);
LLAMA_LOG_WARN(" => changing mla to 3\n");
LLAMA_LOG_WARN("=============================================================================\n");
cparams.mla_attn = 3;
}
}
if (model->arch == LLM_ARCH_OPENAI_MOE && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
if (cparams.reduce_type == GGML_TYPE_F16) {
LLAMA_LOG_WARN("=====================================================================\n");
LLAMA_LOG_WARN("GPT-OSS with split mode graph requires f32 precision\n");
LLAMA_LOG_WARN(" => changing cparams.reduce_type to f32\n");
LLAMA_LOG_WARN("=====================================================================\n");
cparams.reduce_type = GGML_TYPE_F32;
}
}
if ((model->arch == LLM_ARCH_MELLUM || model->arch == LLM_ARCH_LAGUNA) && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
if (cparams.reduce_type == GGML_TYPE_F16) {
const char * mname = model->arch == LLM_ARCH_MELLUM ? "Mellum" : "Laguna";
LLAMA_LOG_WARN("=====================================================================\n");
LLAMA_LOG_WARN("%s with split mode graph requires bf16 or f32 precision\n", mname);
LLAMA_LOG_WARN(" => changing cparams.reduce_type to bf16\n");
LLAMA_LOG_WARN("=====================================================================\n");
cparams.reduce_type = GGML_TYPE_BF16;
}
}
if (model->arch != LLM_ARCH_GLM4_MOE && model->arch != LLM_ARCH_QWEN35 &&
model->arch != LLM_ARCH_QWEN35MOE && model->arch != LLM_ARCH_GEMMA4 &&
model->arch != LLM_ARCH_GEMMA4_MTP && model->arch != LLM_ARCH_GLM_DSA &&
model->arch != LLM_ARCH_GEMMA4_ASSISTANT &&
cparams.mtp != 0) {
cparams.mtp = 0;
}
cparams.mtp_op_type = params.mtp_op_type;
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
if (model->is_mla_model()) {
LLAMA_LOG_INFO("%s: mla_attn = %d\n", __func__, cparams.mla_attn);
}
LLAMA_LOG_INFO("%s: attn_max_b = %d\n", __func__, cparams.attn_max_batch);
LLAMA_LOG_INFO("%s: fused_moe = %d\n", __func__, cparams.fused_moe_up_gate);
LLAMA_LOG_INFO("%s: grouped er = %d\n", __func__, cparams.grouped_expert_routing);
LLAMA_LOG_INFO("%s: fused_up_gate = %d\n", __func__, cparams.fused_up_gate);
LLAMA_LOG_INFO("%s: fused_mmad = %d\n", __func__, cparams.fused_mmad);
LLAMA_LOG_INFO("%s: rope_cache = %d\n", __func__, cparams.rope_cache);
LLAMA_LOG_INFO("%s: graph_reuse = %d\n", __func__, cparams.graph_reuse);
LLAMA_LOG_INFO("%s: k_cache_hadam = %d\n", __func__, cparams.k_cache_hadamard);
LLAMA_LOG_INFO("%s: v_cache_hadam = %d\n", __func__, cparams.v_cache_hadamard);
LLAMA_LOG_INFO("%s: split_mode_graph_scheduling = %d\n", __func__, cparams.split_mode_graph_scheduling);
LLAMA_LOG_INFO("%s: reduce_type = %s\n", __func__, ggml_type_name(cparams.reduce_type));
LLAMA_LOG_INFO("%s: sched_async = %d\n", __func__, cparams.scheduler_async);
LLAMA_LOG_INFO("%s: ser = %d, %g\n", __func__, cparams.min_experts, cparams.thresh_experts);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
if (cparams.cuda_params) {
LLAMA_LOG_INFO("%s: cuda_params = %s\n", __func__, (const char *)cparams.cuda_params);
}
ctx->abort_callback = params.abort_callback;
ctx->abort_callback_data = params.abort_callback_data;
ctx->sampling.rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all;
ctx->is_encoding = llama_model_has_encoder(model);
uint32_t kv_size = cparams.n_ctx;
ggml_type type_k = params.type_k;
ggml_type type_v = params.type_v;
if (model->arch == LLM_ARCH_MAMBA) {
kv_size = std::max((uint32_t) 1, params.n_seq_max);
type_k = GGML_TYPE_F32; type_v = GGML_TYPE_F32; }
GGML_ASSERT(hparams.n_embd_head_k(0) % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head_v(0) % ggml_blck_size(type_v) == 0);
if (!hparams.vocab_only) {
const int main_gpu_id = (model->main_gpu >= 0 && model->main_gpu < (int)model->devices.size())
? model->devices[model->main_gpu]
: model->main_gpu;
#if defined(GGML_USE_METAL)
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, ctx->backend_metal);
}
#elif defined(GGML_USE_CUDA)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_t backend = ggml_backend_cuda_init(main_gpu_id, cparams.cuda_params);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, main_gpu_id);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
} else {
auto params = cparams.cuda_params;
std::string new_params;
if (false && model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
static const std::string extra_string{"graphs=0"};
if (params) new_params = std::string{(const char *)params} + ',';
new_params += extra_string;
params = new_params.data();
}
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_cuda_init(device, params);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
}
}
#elif defined(GGML_USE_VULKAN)
if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH || model->split_mode == LLAMA_SPLIT_MODE_ATTN) {
LLAMA_LOG_ERROR("%s: split mode 'graph' or 'attn' not supported. Failed to initialize Vulkan backend\n", __func__);
llama_free(ctx);
return nullptr;
}
if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
ggml_backend_t backend = ggml_backend_vk_init(main_gpu_id);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
} else {
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_vk_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
}
}
#elif defined(GGML_USE_SYCL)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_id);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, main_gpu_id);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
ggml_backend_t backend = ggml_backend_sycl_init(i);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
}
}
#elif defined(GGML_USE_KOMPUTE)
if (model->n_gpu_layers > 0) {
auto * backend = ggml_backend_kompute_init(main_gpu_id);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
}
#elif defined(GGML_USE_CANN)
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_GRAPH) {
ggml_backend_t backend = ggml_backend_cann_init(main_gpu_id);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, main_gpu_id);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
} else {
for (int32_t device = 0; device < ggml_backend_cann_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_cann_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CANN%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
}
}
#endif
#ifdef GGML_USE_BLAS
ctx->backend_blas = ggml_backend_blas_init();
if (ctx->backend_blas == nullptr) {
LLAMA_LOG_WARN("%s: failed to initialize BLAS backend\n", __func__);
} else {
ggml_backend_add_from_device(ctx, ctx->backend_blas);
}
#endif
#if defined(GGML_USE_RPC)
if (model->n_gpu_layers > 0) {
for (const auto & device : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(device.endpoint.c_str(), device.device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize RPC%d to '%s'\n", __func__, device.device,
device.endpoint.c_str());
llama_free(ctx);
return nullptr;
}
ggml_backend_add_from_device(ctx, backend);
}
}
#endif
if (ctx->cparams.devices.size()) {
std::vector<ggml_backend_t> backends = {};
std::vector<const char*> device_list = {};
for (auto device : ctx->cparams.devices) {
ggml_backend_t backend = ctx->ggml_backend_by_name(device.c_str());
if (backend) {
backends.push_back(backend);
}
}
ctx->backends = std::move(backends);
}
ctx->backend_cpu = ggml_backend_cpu_init();
if (ctx->backend_cpu == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(ctx->backend_cpu);
if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv,
params.type_k_first, params.type_k_last, params.type_v_first, params.type_v_last,
params.n_k_first, params.n_k_last, params.n_v_first, params.n_v_last)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;
}
{
size_t memory_size_k = 0;
size_t memory_size_v = 0;
for (auto & k : ctx->kv_self.k_l) {
if (k) {
memory_size_k += ggml_nbytes(k);
}
}
for (auto & v : ctx->kv_self.v_l) {
if (v) {
memory_size_v += ggml_nbytes(v);
}
}
if (memory_size_k + memory_size_v > 0) {
if (cparams.mla_attn != 0 && !cparams.flash_attn) {
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
} else if (cparams.mla_attn != 0 && cparams.flash_attn) {
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T: not used\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f));
} else {
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
}
}
}
{
if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) {
LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__);
llama_free(ctx);
return nullptr;
}
LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
ggml_backend_buffer_name(ctx->buf_output),
ggml_backend_buffer_get_size(ctx->buf_output) / 1024.0 / 1024.0);
}
{
std::vector<ggml_backend_buffer_type_t> backend_buft;
for (auto * backend : ctx->backends) {
if (ggml_backend_is_cpu(backend)) {
backend_buft.push_back(llama_default_buffer_type_cpu(true));
} else {
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
}
}
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_ubatch);
const size_t max_nodes = ctx->max_nodes(n_tokens, cparams.n_ctx);
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
bool pipeline_parallel =
llama_get_device_count(*model) > 1 &&
model->n_gpu_layers > (int)model->hparams.n_layer &&
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv && !model->has_tensor_overrides();
#ifndef GGML_USE_CUDA
pipeline_parallel = false;
#endif
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, pipeline_parallel);
if (pipeline_parallel) {
LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(ctx->sched));
}
llama_repack_up_gate_exps(*ctx);
int n_past = cparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(&ctx->model); ggml_cgraph * gf = llm_build_context::llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true, cparams.worst_graph_tokens);
bool gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
if (!gf_success)
{
if (pipeline_parallel) {
LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__);
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), max_nodes, false);
gf_success = ggml_backend_sched_reserve(ctx->sched, gf);
}
if (!gf_success) {
LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
llama_free(ctx);
return nullptr;
}
}
for (size_t i = 0; i < ctx->backends.size(); i++) {
ggml_backend_t backend = ctx->backends[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
if (size > 1) {
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
}
}
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
}
}
if (params.offload_policy) {
const std::vector<std::pair<int, int>>& policy = *(const std::vector<std::pair<int, int>>*)params.offload_policy;
for (auto [op, on_off] : policy) {
if (op < 0 || op >= int(GGML_OP_COUNT)) {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for all ops to %s\n", on_off ? "ON" : "OFF");
} else {
LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXX Setting offload policy for op %s to %s\n",
ggml_op_name(ggml_op(op)), on_off ? "ON" : "OFF");
}
ggml_backend_sched_set_op_offload(ctx->sched, ggml_op(op), on_off);
}
}
if (params.only_active_experts) {
LLAMA_LOG_INFO("%s: enabling only_active_experts scheduling\n", __func__);
ggml_backend_sched_set_only_active_experts(ctx->sched, true);
}
if (model->split_mode == LLAMA_SPLIT_MODE_GRAPH && (!model->has_tensor_overrides() || cparams.split_mode_graph_scheduling)) {
ggml_backend_sched_set_split_mode_graph(ctx->sched, true, cparams.scheduler_async);
ggml_backend_sched_set_max_extra_alloc(ctx->sched, params.max_extra_alloc);
if (model->has_tensor_overrides() && cparams.split_mode_graph_scheduling) {
LLAMA_LOG_INFO("XXXXXXXX Split Mode Graph Scheduling is FORCED despite tensor overrides due to user choice.\n");
LLAMA_LOG_INFO("XXXXXXXX It may or might NOT infer properly due to unsupported combinations between SMGS and every possible tensor overrides.\n");
}
}
if (cparams.mtp && hparams.nextn_predict_layers > 0) {
const auto n_batch = cparams.n_batch;
const auto n_vocab = hparams.n_vocab;
const auto n_embd = hparams.n_embd;
const size_t logits_size = n_vocab*n_batch;
const size_t embd_size = n_embd*n_batch;
if (ctx->output_ids.empty()) {
ctx->output_ids.resize(n_batch);
}
const size_t prev_size = ctx->buf_output ? ggml_backend_buffer_get_size(ctx->buf_output) : 0;
const size_t new_size = (logits_size + embd_size) * sizeof(float);
if (!ctx->buf_output || prev_size < new_size) {
if (ctx->buf_output) {
ggml_backend_buffer_free(ctx->buf_output);
ctx->buf_output = nullptr;
ctx->logits = nullptr;
ctx->embd = nullptr;
}
ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size);
if (ctx->buf_output == nullptr) {
LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
}
}
}
return ctx;
}
// Releases a context by deleting it. Passing nullptr is harmless.
void llama_free(struct llama_context * ctx) {
    if (ctx == nullptr) {
        return; // deleting a null pointer would be a no-op anyway
    }
    delete ctx;
}
// Returns the address of the vocabulary stored inside `model` (no copy is made).
const struct llama_vocab* llama_model_get_vocab(const struct llama_model* model) {
    const struct llama_vocab & vocab = model->vocab;
    return &vocab;
}
// Returns the address of the model that `ctx` refers to.
const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    const struct llama_model & owner = ctx->model;
    return &owner;
}
// Returns the address of the vocabulary of the model that `ctx` refers to.
const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx) {
    const struct llama_vocab & vocab = ctx->model.vocab;
    return &vocab;
}
// Returns the context length stored in the context parameters.
uint32_t llama_n_ctx(const struct llama_context * ctx) {
    const auto & cp = ctx->cparams;
    return cp.n_ctx;
}
// Returns the logical batch size stored in the context parameters.
uint32_t llama_n_batch(const struct llama_context * ctx) {
    const auto & cp = ctx->cparams;
    return cp.n_batch;
}
// Returns the micro-batch size (`n_ubatch`) stored in the context parameters.
uint32_t llama_n_ubatch(const struct llama_context * ctx) {
    const auto & cp = ctx->cparams;
    return cp.n_ubatch;
}
// Returns the `size` field of the context's self-attention KV cache.
// NOTE(review): this reports the cache size rather than a dedicated n_seq_max
// parameter - confirm that this is the intended value for all model types.
uint32_t llama_n_seq_max(const struct llama_context * ctx) {
    const auto & cache = ctx->kv_self;
    return cache.size;
}
// Returns the vocabulary type as reported by the vocabulary object itself.
enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    const auto kind = vocab->get_type();
    return kind;
}
// Returns the address of the vocabulary stored inside `model`.
// Yields the same pointer as llama_model_get_vocab().
const struct llama_vocab* llama_get_model_vocab(const struct llama_model* model) {
    const struct llama_vocab & vocab = model->vocab;
    return &vocab;
}
// Returns the llama_rope_type associated with the architecture of `model`.
//
// Architectures are listed explicitly and the switch has no `default:` label.
// LLM_ARCH_UNKNOWN aborts; any architecture that is not listed in the switch
// falls through to the trailing `return LLAMA_ROPE_TYPE_NONE`.
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
switch (model->arch) {
// architectures reported as LLAMA_ROPE_TYPE_NONE
case LLM_ARCH_GPT2:
case LLM_ARCH_GPTJ:
case LLM_ARCH_MPT:
case LLM_ARCH_REFACT:
case LLM_ARCH_BLOOM:
case LLM_ARCH_MAMBA:
case LLM_ARCH_JINA_BERT_V2:
case LLM_ARCH_T5:
case LLM_ARCH_T5ENCODER:
case LLM_ARCH_JAIS:
return LLAMA_ROPE_TYPE_NONE;
// architectures reported as LLAMA_ROPE_TYPE_NORM
case LLM_ARCH_LLAMA:
case LLM_ARCH_DECI:
case LLM_ARCH_LLAMA4:
case LLM_ARCH_BAICHUAN:
case LLM_ARCH_STARCODER:
case LLM_ARCH_PLAMO:
case LLM_ARCH_ORION:
case LLM_ARCH_INTERNLM2:
case LLM_ARCH_MINICPM:
case LLM_ARCH_XVERSE:
case LLM_ARCH_COMMAND_R:
case LLM_ARCH_OLMO:
case LLM_ARCH_ARCTIC:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_CHATGLM:
case LLM_ARCH_GLM4:
case LLM_ARCH_GRANITE:
case LLM_ARCH_GRANITE_MOE:
case LLM_ARCH_COHERE2:
case LLM_ARCH_COHERE2_MOE:
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
case LLM_ARCH_SMOLLM3:
case LLM_ARCH_MISTRAL3:
case LLM_ARCH_GLM_DSA:
case LLM_ARCH_MISTRAL4:
return LLAMA_ROPE_TYPE_NORM;
// architectures reported as LLAMA_ROPE_TYPE_NEOX
case LLM_ARCH_FALCON:
case LLM_ARCH_GROK:
case LLM_ARCH_DBRX:
case LLM_ARCH_BERT:
case LLM_ARCH_NOMIC_BERT:
case LLM_ARCH_STABLELM:
case LLM_ARCH_GLM4_MOE:
case LLM_ARCH_BITNET:
case LLM_ARCH_BITNET_25:
case LLM_ARCH_BITNET_B158:
case LLM_ARCH_QWEN:
case LLM_ARCH_QWEN2:
case LLM_ARCH_QWEN2MOE:
case LLM_ARCH_QWEN3:
case LLM_ARCH_QWEN3MOE:
case LLM_ARCH_MELLUM:
case LLM_ARCH_QWEN3NEXT:
case LLM_ARCH_PHI2:
case LLM_ARCH_PHI3:
case LLM_ARCH_GEMMA:
case LLM_ARCH_GEMMA2:
case LLM_ARCH_GEMMA3:
case LLM_ARCH_STARCODER2:
case LLM_ARCH_OPENELM:
case LLM_ARCH_GPTNEOX:
case LLM_ARCH_CODESHELL:
case LLM_ARCH_DOTS1:
case LLM_ARCH_HUNYUAN_MOE:
case LLM_ARCH_OPENAI_MOE:
case LLM_ARCH_BAILINGMOE2:
case LLM_ARCH_MINIMAX_M2:
case LLM_ARCH_MIMO2:
case LLM_ARCH_SEED_OSS:
case LLM_ARCH_STEP35:
case LLM_ARCH_LAGUNA:
case LLM_ARCH_GEMMA4:
case LLM_ARCH_GEMMA4_MTP:
case LLM_ARCH_GEMMA4_ASSISTANT:
return LLAMA_ROPE_TYPE_NEOX;
// architectures reported as LLAMA_ROPE_TYPE_MROPE
case LLM_ARCH_QWEN2VL:
return LLAMA_ROPE_TYPE_MROPE;
// architectures reported as LLAMA_ROPE_TYPE_IMROPE
case LLM_ARCH_QWEN3VL:
case LLM_ARCH_QWEN3VLMOE:
case LLM_ARCH_QWEN35MOE:
case LLM_ARCH_QWEN35:
return LLAMA_ROPE_TYPE_IMROPE;
// an unknown architecture is treated as a fatal error
case LLM_ARCH_UNKNOWN:
GGML_ABORT("unknown architecture");
}
// reached only for architectures that are not listed above
return LLAMA_ROPE_TYPE_NONE;
}
// Returns the pooling type stored in the context parameters.
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
    const auto & cp = ctx->cparams;
    return cp.pooling_type;
}
// Returns the vocabulary size recorded in the model hyper-parameters.
int32_t llama_n_vocab(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.n_vocab;
}
// Returns the training context length recorded in the model hyper-parameters.
int32_t llama_n_ctx_train(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.n_ctx_train;
}
// Returns the embedding width (`n_embd`) recorded in the model hyper-parameters.
int32_t llama_model_n_embd(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.n_embd;
}
// Returns the value computed by the hyper-parameters' n_embd_inp() accessor.
int32_t llama_model_n_embd_inp(const llama_model* model) {
    const auto & hp = model->hparams;
    return hp.n_embd_inp();
}
// Returns the layer count recorded in the model hyper-parameters.
int32_t llama_n_layer(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.n_layer;
}
// Returns the training-time RoPE frequency scale recorded in the model hyper-parameters.
float llama_rope_freq_scale_train(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.rope_freq_scale_train;
}
// Copies the metadata value stored under `key` into `buf` (at most `buf_size` bytes,
// snprintf semantics) and returns snprintf's result.
// When the key is absent, `buf` is set to the empty string (if it has room) and -1 is returned.
int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & kv = model->gguf_kv;
    const auto entry = kv.find(key);

    if (entry != kv.end()) {
        return snprintf(buf, buf_size, "%s", entry->second.c_str());
    }

    // key not present
    if (buf_size > 0) {
        buf[0] = '\0';
    }
    return -1;
}
// Returns the number of metadata key/value pairs held by the model.
int32_t llama_model_meta_count(const struct llama_model * model) {
    const auto & kv = model->gguf_kv;
    return static_cast<int>(kv.size());
}
// Copies the key of the i-th metadata entry (in the container's iteration order) into `buf`
// and returns snprintf's result.
// An out-of-range index yields an empty string (if `buf` has room) and -1.
int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    const auto & kv = model->gguf_kv;

    const bool in_range = i >= 0 && i < (int) kv.size();
    if (!in_range) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    const auto entry = std::next(kv.begin(), i);
    return snprintf(buf, buf_size, "%s", entry->first.c_str());
}
// Copies the value of the i-th metadata entry (in the container's iteration order) into `buf`
// and returns snprintf's result.
// An out-of-range index yields an empty string (if `buf` has room) and -1.
int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
    const auto & kv = model->gguf_kv;

    const bool in_range = i >= 0 && i < (int) kv.size();
    if (!in_range) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }

    const auto entry = std::next(kv.begin(), i);
    return snprintf(buf, buf_size, "%s", entry->second.c_str());
}
// Writes "<arch name> <type name> <ftype name>" into `buf` (snprintf semantics)
// and returns snprintf's result.
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    const auto * arch_name  = llama_model_arch_name(model->arch);
    const auto * type_name  = llama_model_type_name(model->type);
    const auto   ftype_name = llama_model_ftype_name(model->ftype);

    return snprintf(buf, buf_size, "%s %s %s", arch_name, type_name, ftype_name.c_str());
}
// Returns the sum of ggml_nbytes() over every tensor registered in tensors_by_name.
uint64_t llama_model_size(const struct llama_model * model) {
    const auto & tensors = model->tensors_by_name;

    uint64_t total_bytes = 0;
    for (auto entry = tensors.begin(); entry != tensors.end(); ++entry) {
        total_bytes += ggml_nbytes(entry->second);
    }
    return total_bytes;
}
const char* llama_model_chat_template(const struct llama_model* model, const char* name) {
const auto key = name ? LLM_KV(model->arch, name)(LLM_KV_TOKENIZER_CHAT_TEMPLATE)
: LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
const auto& it = model->gguf_kv.find(key);
if (it == model->gguf_kv.end()) {
llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
if (!name && pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
return "mistral-v7-tekken";
}
return nullptr;
}
return it->second.c_str();
}
uint64_t llama_model_n_params(const struct llama_model * model) {
uint64_t nparams = 0;
for (const auto & it : model->tensors_by_name) {
nparams += ggml_nelements(it.second);
}
return nparams;
}
// Linear search of tensors_by_name for the first entry whose name equals `name`.
// Returns the matching tensor, or nullptr when no entry matches.
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
    for (const auto & entry : model->tensors_by_name) {
        if (entry.first == name) {
            return entry.second;
        }
    }
    return nullptr;
}
// True only for the T5 and T5ENCODER architectures.
bool llama_model_has_encoder(const struct llama_model * model) {
    const auto arch = model->arch;
    return arch == LLM_ARCH_T5 || arch == LLM_ARCH_T5ENCODER;
}
// True for every architecture except T5ENCODER.
bool llama_model_has_decoder(const struct llama_model * model) {
    return model->arch != LLM_ARCH_T5ENCODER;
}
// Returns the decoder start token id recorded in the model hyper-parameters.
llama_token llama_model_decoder_start_token(const struct llama_model * model) {
    const auto & hp = model->hparams;
    return hp.dec_start_token_id;
}
struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
try {
struct llama_lora_adapter * adapter = new llama_lora_adapter(model);
llama_lora_adapter_init_internal(model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
return nullptr;
}
}
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
for (int64_t i = 0; i < model.hparams.n_layer; i++) {
buft_layer_count[model.buft_layer[i].buft]++;
}
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
int n_layers = it.second;
struct ggml_init_params params = {
n_layers * ggml_tensor_overhead(),
NULL,
true,
};
ggml_context * ctx = ggml_init(params);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return 1;
}
ctx_map[it.first] = ctx;
}
cvec.tensors.reserve(model.hparams.n_layer);
cvec.tensors.push_back(nullptr); for (size_t il = 1; il < model.hparams.n_layer; il++) {
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
cvec.tensors.push_back(tensor);
}
cvec.ctxs.reserve(ctx_map.size());
cvec.bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.ctxs.push_back(ctx);
cvec.bufs.push_back(buf);
}
return true;
}
// Uploads control-vector data into the context's per-layer tensors.
//
//   data     : `len` floats laid out as consecutive rows of `n_embd` values; row k is
//              written to layer k + 1 (there is no row for layer 0). nullptr disables the
//              control vector by setting the layer range to [-1, -1].
//   n_embd   : must equal the model's n_embd.
//   il_start / il_end : stored as the layer range on the control vector.
//
// Layers whose row would extend past `len` are left untouched.
// Returns 0 on success, 1 on an n_embd mismatch or when tensor initialization fails.
int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
    const llama_model    & model = lctx->model;
    llama_control_vector & cvec  = lctx->cvec;

    if (data == nullptr) {
        cvec.layer_start = -1;
        cvec.layer_end   = -1;
        return 0;
    }

    if (n_embd != (int) model.hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
        return 1;
    }

    // tensors are created lazily on first use
    if (cvec.tensors.empty() && !llama_control_vector_init(cvec, model)) {
        return 1;
    }

    cvec.layer_start = il_start;
    cvec.layer_end   = il_end;

    const size_t n_layer = model.hparams.n_layer;
    for (size_t il = 1; il < n_layer; ++il) {
        auto * dst = cvec.tensors[il];
        assert(dst != nullptr);

        const size_t first = n_embd * (il - 1);
        if (first + n_embd > len) {
            continue; // not enough data supplied for this layer
        }
        ggml_backend_tensor_set(dst, data + first, 0, n_embd * ggml_element_size(dst));
    }

    return 0;
}
// Builds an empty KV-cache view for `ctx` that can record up to `n_seq_max` sequence ids
// per cell. No cell storage is allocated here (both pointers start as nullptr);
// llama_kv_cache_view_update() allocates it on first use.
//
// The initializer is positional. Presumably the fields are, in declaration order:
// n_cells, n_seq_max, token_count, used_cells, max_contiguous, max_contiguous_idx,
// cells, cells_sequences - TODO confirm against the struct declaration.
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
struct llama_kv_cache_view result = {
0,
n_seq_max,
0,
llama_get_kv_cache_used_cells(ctx),
0,
-1,
nullptr,
nullptr,
};
return result;
}
// Releases the two arrays owned by the view and resets both pointers to nullptr.
// The view object itself is not freed.
void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
    // free(nullptr) is a no-op, so no explicit null checks are required
    free(view->cells);
    view->cells = nullptr;

    free(view->cells_sequences);
    view->cells_sequences = nullptr;
}
// Refreshes `view` from the current state of the context's KV cache.
//
// Fills, for every cache cell, its position (pos + delta) and up to view->n_seq_max
// sequence ids (unused slots are set to -1), and recomputes token_count, used_cells,
// max_contiguous and max_contiguous_idx. Logs an error if the recomputed used-cell
// count differs from the cache's own `used` counter.
void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
// (re)allocate the view's arrays when the view is smaller than the cache or not yet allocated
if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
view->n_cells = int32_t(ctx->kv_self.size);
void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
view->cells = (struct llama_kv_cache_view_cell *)p;
// n_seq_max sequence-id slots per cell
p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_seq_max * view->n_cells);
GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
view->cells_sequences = (llama_seq_id *)p;
}
const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
// write cursors into the view's two arrays
llama_kv_cache_view_cell * c_curr = view->cells;
llama_seq_id * cs_curr = view->cells_sequences;
int32_t used_cells = 0;
int32_t token_count = 0;
// start index of the run of empty cells currently being scanned, or -1 when not inside one
int32_t curr_contig_idx = -1;
// length and start index of the longest run of empty cells seen so far
uint32_t max_contig = 0;
int32_t max_contig_idx = -1;
for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_seq_max) {
const size_t curr_size = kv_cells[i].seq_id.size();
// every sequence id attached to a cell counts as one token
token_count += curr_size;
c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
if (curr_size > 0) {
// an occupied cell ends the current empty run; keep it if it is the longest so far
if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
max_contig = i - curr_contig_idx;
max_contig_idx = curr_contig_idx;
}
curr_contig_idx = -1;
} else if (curr_contig_idx < 0) {
// first empty cell of a new run
curr_contig_idx = i;
}
// copy at most n_seq_max sequence ids for this cell
int seq_idx = 0;
for (const llama_seq_id it : kv_cells[i].seq_id) {
if (seq_idx >= view->n_seq_max) {
break;
}
cs_curr[seq_idx] = it;
seq_idx++;
}
if (seq_idx != 0) {
used_cells++;
}
// mark the remaining slots as unused
for (; seq_idx < view->n_seq_max; seq_idx++) {
cs_curr[seq_idx] = -1;
}
}
// account for an empty run that extends to the end of the cache
if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
max_contig_idx = curr_contig_idx;
max_contig = kv_cells.size() - curr_contig_idx;
}
view->max_contiguous = max_contig;
view->max_contiguous_idx = max_contig_idx;
view->token_count = token_count;
view->used_cells = used_cells;
// consistency check against the cache's own bookkeeping
if (uint32_t(used_cells) != ctx->kv_self.used) {
LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
__func__, ctx->kv_self.used, used_cells);
}
}
// Returns the total number of sequence ids attached to the cells of the KV cache
// (a cell shared by several sequences is counted once per sequence).
int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx) {
    const auto & cache = ctx->kv_self;

    int n_tokens = 0;
    for (uint32_t cell = 0; cell < cache.size; ++cell) {
        n_tokens += cache.cells[cell].seq_id.size();
    }
    return n_tokens;
}
// Returns the KV cache's `used` counter.
int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
    const auto & cache = ctx->kv_self;
    return cache.used;
}
// Context-level wrapper: forwards to the llama_kv_cache_clear() overload that takes the cache.
void llama_kv_cache_clear(struct llama_context * ctx) {
    auto & cache = ctx->kv_self;
    llama_kv_cache_clear(cache);
}
// Tries to set up per-step checkpointing for the state tensors in kv.s_l.
//
// Steps:
//   1. bail out when no state tensor has backing storage (neither `extra` nor a buffer);
//   2. derive the per-step state and conv dimensions from the model's ssm_* hyper-parameters,
//      unless they have already been computed;
//   3. allocate the per-step storage for `max_tokens` tokens;
//   4. allocate the conv-state shadow buffers.
// Returns true on success. On every failure path kv.save_per_step_ssm is cleared and
// false is returned.
static bool spec_ckpt_try_per_step(llama_kv_cache & kv, const llama_model & model, int max_tokens) {
    bool on_device = false;
    bool on_host   = false;
    for (const auto * state : kv.s_l) {
        if (state == nullptr) {
            continue;
        }
        if (state->extra != nullptr) {
            on_device = true;
        } else if (state->buffer != nullptr) {
            if (ggml_backend_buffer_is_host(state->buffer)) {
                on_host = true;
            } else {
                on_device = true;
            }
        }
    }
    if (!(on_device || on_host)) {
        kv.save_per_step_ssm = false;
        return false;
    }

    auto & ckpt = kv.ckpt;
    if (ckpt.per_step_ssm_state_size <= 0) {
        const auto & hp = model.hparams;

        const int64_t n_v_heads  = hp.ssm_dt_rank;
        const int64_t v_head_dim = hp.ssm_d_inner / n_v_heads;
        const int64_t k_head_dim = hp.ssm_d_state;
        const int64_t n_k_heads  = hp.ssm_n_group;

        const int64_t k_dim     = k_head_dim * n_k_heads;
        const int64_t v_dim     = v_head_dim * n_v_heads;
        const int64_t conv_cols = 2 * k_dim + v_dim;

        ckpt.per_step_ssm_state_size = v_head_dim * v_head_dim * n_v_heads;
        ckpt.per_step_conv_state_dim = (hp.ssm_d_conv - 1) * conv_cols;
        ckpt.per_step_conv_dim       = conv_cols;
        ckpt.per_step_d_conv         = hp.ssm_d_conv;
    }

    if (!kv.per_step_alloc(model, max_tokens)) {
        kv.save_per_step_ssm = false;
        return false;
    }

    if (!kv.checkpoint_alloc_shadows(true)) {
        LLAMA_LOG_ERROR("%s: failed to allocate conv-state shadow buffers for per-step checkpoints\n", __func__);
        kv.save_per_step_ssm = false;
        return false;
    }

    return true;
}
// Computes the number of bytes to reserve for a CPU-side checkpoint of `seq_id`.
//
// The total is assembled from fixed-size header fields plus per-layer records:
//   - one uint32_t, plus a llama_pos and a uint32_t when `seq_id` is a valid sequence;
//   - the v_state and n_layer fields;
//   - per layer, a K record, and a V record whose layout depends on v_state
//     (0: V stored untransposed, 1: V stored transposed, 2: no V cache);
//   - a qnext_state flag and, when that storage exists, a per-layer record that includes
//     one state row for layers that have a state tensor and a valid sequence.
static size_t llama_spec_ckpt_cpu_state_reserve(const llama_context * ctx, llama_seq_id seq_id) {
    const auto & kv_self = ctx->kv_self;

    // the same validity test gates both the header fields and the per-layer state row
    const bool seq_valid =
        seq_id >= 0 &&
        llama_kv_qnext_seq_id_in_range(kv_self, seq_id) &&
        (uint32_t) seq_id < kv_self.size;

    size_t total = sizeof(uint32_t);
    if (seq_valid) {
        total += sizeof(llama_pos) + sizeof(uint32_t);
    }

    uint32_t v_state = 0;
    if (kv_self.v_l.empty()) {
        v_state = 2;
    } else if (kv_self.v_trans) {
        v_state = 1;
    }
    const uint32_t n_layer = kv_self.k_l.size();

    total += sizeof(v_state);
    total += sizeof(n_layer);

    // K record per layer
    total += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint64_t));

    // V record per layer
    switch (v_state) {
        case 0:
            total += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint64_t));
            break;
        case 1:
            total += (size_t) n_layer * (sizeof(int32_t) + sizeof(uint32_t) + sizeof(uint32_t));
            break;
        default:
            break;
    }

    const uint32_t qnext_state = llama_kv_has_qnext_state_storage(kv_self) ? 1 : 0;
    total += sizeof(qnext_state);

    if (qnext_state == 0) {
        return total;
    }

    for (uint32_t il = 0; il < n_layer; ++il) {
        const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr;

        uint64_t row_bytes = 0;
        if (has_s_cache) {
            row_bytes = ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]);
        }
        const uint32_t n_rows = (has_s_cache && seq_valid) ? 1 : 0;

        // fixed per-layer record followed by the optional state row
        total += sizeof(int32_t) + sizeof(uint64_t) + sizeof(uint32_t);
        total += (size_t) n_rows * row_bytes;
    }

    return total;
}
// Map a LLAMA_SPEC_CKPT_* mode value to its short display name.
// Any value without a dedicated name is reported as "none".
static const char * llama_spec_ckpt_mode_name(int mode) {
    if (mode == LLAMA_SPEC_CKPT_PER_STEP) {
        return "per-step";
    }
    if (mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        return "gpu-fallback";
    }
    if (mode == LLAMA_SPEC_CKPT_CPU) {
        return "cpu";
    }
    if (mode == LLAMA_SPEC_CKPT_AUTO) {
        return "auto";
    }
    return "none";
}
// Select the recurrent checkpoint mode for speculative decoding and return it.
//
// - Returns LLAMA_SPEC_CKPT_NONE when the cache does not support checkpoints.
// - Once a mode has been fixed by an earlier call, that mode is re-selected
//   without reallocating; a per-step mode whose fixed capacity is smaller than
//   `max_tokens` yields LLAMA_SPEC_CKPT_NONE for this call.
// - Otherwise the requested `mode` is tried. AUTO starts at per-step and falls
//   back to gpu-fallback, then cpu; an explicitly requested per-step or
//   gpu-fallback mode that cannot be allocated returns LLAMA_SPEC_CKPT_NONE.
//   The resolved mode is stored as the fixed mode for later calls.
int llama_spec_ckpt_init(struct llama_context * ctx, int mode, int max_tokens) {
    auto & kv   = ctx->kv_self;
    auto & ckpt = kv.ckpt;

    kv.save_per_step_ssm    = false;
    ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_NONE;

    if (!kv.checkpoint_supported()) {
        return (int)LLAMA_SPEC_CKPT_NONE;
    }

    // A previously fixed mode is reused as-is.
    if (ckpt.fixed_spec_mode != LLAMA_SPEC_CKPT_NONE) {
        const bool over_capacity =
            ckpt.fixed_spec_mode == LLAMA_SPEC_CKPT_PER_STEP && max_tokens > ckpt.fixed_max_tokens;
        if (over_capacity) {
            LLAMA_LOG_WARN("%s: fixed per-step checkpoint capacity is %d tokens, but the current speculative batch requests %d; disabling checkpoint for this batch\n",
                    __func__, ckpt.fixed_max_tokens, max_tokens);
            return (int)LLAMA_SPEC_CKPT_NONE;
        }
        ckpt.selected_spec_mode = ckpt.fixed_spec_mode;
        return ckpt.selected_spec_mode;
    }

    int chosen    = LLAMA_SPEC_CKPT_NONE;
    int candidate = mode;
    if (candidate == LLAMA_SPEC_CKPT_AUTO) {
        candidate = LLAMA_SPEC_CKPT_PER_STEP;
    }

    if (candidate == LLAMA_SPEC_CKPT_PER_STEP) {
        if (spec_ckpt_try_per_step(kv, ctx->model, max_tokens)) {
            chosen = LLAMA_SPEC_CKPT_PER_STEP;
        } else {
            if (mode == LLAMA_SPEC_CKPT_PER_STEP) {
                LLAMA_LOG_ERROR("%s: failed to preallocate per-step checkpoint buffers for max_tokens=%d; --recurrent-ckpt-mode=%s requires startup allocation\n",
                        __func__, max_tokens, llama_spec_ckpt_mode_name(mode));
                return (int)LLAMA_SPEC_CKPT_NONE;
            }
            LLAMA_LOG_WARN("%s: auto checkpoint mode could not preallocate per-step buffers for max_tokens=%d; falling back to gpu-fallback\n",
                    __func__, max_tokens);
            candidate = LLAMA_SPEC_CKPT_GPU_FALLBACK;
        }
    }

    if (chosen == LLAMA_SPEC_CKPT_NONE && candidate == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        if (kv.checkpoint_alloc_shadows()) {
            chosen = LLAMA_SPEC_CKPT_GPU_FALLBACK;
        } else {
            if (mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
                LLAMA_LOG_ERROR("%s: failed to preallocate gpu-fallback checkpoint shadows at startup; --recurrent-ckpt-mode=%s requires startup allocation\n",
                        __func__, llama_spec_ckpt_mode_name(mode));
                return (int)LLAMA_SPEC_CKPT_NONE;
            }
            LLAMA_LOG_WARN("%s: auto checkpoint mode could not preallocate gpu-fallback checkpoint shadows; falling back to cpu\n",
                    __func__);
            candidate = LLAMA_SPEC_CKPT_CPU;
        }
    }

    // Anything still unresolved ends up on the CPU-serialized path.
    if (chosen == LLAMA_SPEC_CKPT_NONE) {
        chosen = LLAMA_SPEC_CKPT_CPU;
    }

    if (chosen == LLAMA_SPEC_CKPT_CPU) {
        const size_t cpu_reserve = llama_spec_ckpt_cpu_state_reserve(ctx, 0);
        ckpt.cpu_state_data.clear();
        ckpt.cpu_state_data.reserve(cpu_reserve);
        LLAMA_LOG_INFO("%s: CPU serialized checkpoint reserve = %8.2f MiB (per seq)\n",
                __func__, cpu_reserve / 1024.0 / 1024.0);
    }

    const bool is_per_step = chosen == LLAMA_SPEC_CKPT_PER_STEP;

    ckpt.fixed_spec_mode    = chosen;
    ckpt.fixed_max_tokens   = is_per_step ? max_tokens : 0;
    ckpt.selected_spec_mode = chosen;

    std::string suffix;
    if (is_per_step) {
        suffix = std::string(" (max_tokens=") + std::to_string(max_tokens) + ")";
    }
    LLAMA_LOG_INFO("%s: fixed recurrent checkpoint mode = %s%s\n",
            __func__, llama_spec_ckpt_mode_name(chosen), suffix.c_str());
    return chosen;
}
// Take a checkpoint of the recurrent state using the currently selected mode.
//   per-step     : only arms kv.save_per_step_ssm and reports success
//   gpu-fallback : delegates to kv.checkpoint_save()
//   cpu          : serializes the sequence (PARTIAL_ONLY) into kv.ckpt.cpu_state_data;
//                  succeeds when at least one byte was written
// Returns false for any other mode.
bool llama_spec_ckpt_save(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;
    const auto mode = kv.ckpt.selected_spec_mode;

    if (mode == LLAMA_SPEC_CKPT_PER_STEP) {
        kv.save_per_step_ssm = true;
        return true;
    }
    if (mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        return kv.checkpoint_save(ctx->sched);
    }
    if (mode != LLAMA_SPEC_CKPT_CPU) {
        return false;
    }

    auto & blob = kv.ckpt.cpu_state_data;
    const size_t need = llama_state_seq_get_size(ctx, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    blob.resize(need);
    const size_t n_written = llama_state_seq_get_data(
            ctx, blob.data(), need, seq_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
    // Shrink to what was actually produced so restore sees the exact payload.
    blob.resize(n_written);
    return n_written > 0;
}
// Roll the recurrent state back after speculative decoding.
//
// per-step     : restores the state recorded at `accepted_step`, moves the
//                sequence cell to n_past + accepted_step, trims everything after
//                it and returns true (false if the per-step restore failed).
// gpu-fallback : restores the shadow checkpoint, trims from n_past, returns false.
// cpu          : replays the serialized blob (if any), trims from n_past, returns false.
// other modes  : returns false without touching the cache.
// NOTE(review): a false return from the fallback modes presumably tells the
// caller to re-decode the accepted tokens -- confirm against the call sites.
bool llama_spec_ckpt_restore(struct llama_context * ctx, llama_seq_id seq_id,
        llama_pos n_past, int accepted_step) {
    auto & kv = ctx->kv_self;
    const auto mode = kv.ckpt.selected_spec_mode;

    if (mode == LLAMA_SPEC_CKPT_PER_STEP) {
        if (!kv.per_step_restore(ctx->model, ctx->sched, accepted_step)) {
            return false;
        }
        const llama_pos accepted_pos = n_past + accepted_step;
        const bool seq_in_cells = seq_id >= 0 && (uint32_t)seq_id < kv.size;
        if (seq_in_cells) {
            kv.cells[seq_id].pos = accepted_pos;
        }
        llama_kv_cache_seq_rm(kv, seq_id, accepted_pos + 1, -1);
        return true;
    }

    if (mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        kv.checkpoint_restore(ctx->sched);
    } else if (mode == LLAMA_SPEC_CKPT_CPU) {
        const auto & blob = kv.ckpt.cpu_state_data;
        if (!blob.empty()) {
            llama_state_seq_set_data(ctx, blob.data(), blob.size(), seq_id,
                    LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
        }
    } else {
        return false;
    }

    // Both fallback modes drop everything from n_past onwards.
    llama_kv_cache_seq_rm(kv, seq_id, n_past, -1);
    return false;
}
// Drop the active checkpoint: disarm per-step capture if it was in use, delete
// the kv checkpoint for per-step / gpu-fallback, then reset the selected mode
// and empty the CPU blob.
void llama_spec_ckpt_discard(struct llama_context * ctx) {
    auto & kv = ctx->kv_self;
    const auto mode = kv.ckpt.selected_spec_mode;

    const bool was_per_step = mode == LLAMA_SPEC_CKPT_PER_STEP;
    if (was_per_step) {
        kv.save_per_step_ssm = false;
    }
    if (was_per_step || mode == LLAMA_SPEC_CKPT_GPU_FALLBACK) {
        kv.checkpoint_delete();
    }

    kv.ckpt.selected_spec_mode = LLAMA_SPEC_CKPT_NONE;
    kv.ckpt.cpu_state_data.clear();
}
// Context-level entry point: forwards to the kv-cache overload on ctx->kv_self
// and returns its result.
bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    auto & kv = ctx->kv_self;
    const bool removed = llama_kv_cache_seq_rm(kv, seq_id, p0, p1);
    return removed;
}
// Context-level entry point for copying a sequence range.
// Copying a sequence onto itself is a no-op and is skipped.
void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
    if (seq_id_src != seq_id_dst) {
        auto & kv = ctx->kv_self;
        llama_kv_cache_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
    }
}
// Context-level entry point: forwards to the kv-cache overload on ctx->kv_self.
void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;
    llama_kv_cache_seq_keep(kv, seq_id);
}
// Context-level entry point for shifting positions of a sequence range.
// A zero delta changes nothing, so the cache is not touched in that case.
void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
    if (delta != 0) {
        auto & kv = ctx->kv_self;
        llama_kv_cache_seq_add(kv, seq_id, p0, p1, delta);
    }
}
// Context-level entry point for dividing positions of a sequence range.
// Dividing by 1 changes nothing, so the cache is not touched in that case.
void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
    if (d != 1) {
        auto & kv = ctx->kv_self;
        llama_kv_cache_seq_div(kv, seq_id, p0, p1, d);
    }
}
// Smallest position stored for `seq_id`.
// For hybrid or recurrent caches the maximum position is reported instead.
llama_pos llama_kv_cache_seq_pos_min(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;
    const bool use_pos_max = kv.hybrid || kv.recurrent;
    return use_pos_max
        ? llama_kv_cache_seq_pos_max(kv, seq_id)
        : llama_kv_cache_seq_pos_min(kv, seq_id);
}
// Context-level entry point: forwards to the kv-cache overload on ctx->kv_self
// and returns its result.
llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
    auto & kv = ctx->kv_self;
    const llama_pos pos_max = llama_kv_cache_seq_pos_max(kv, seq_id);
    return pos_max;
}
// Context-level entry point: forwards to the kv-cache overload on ctx->kv_self.
void llama_kv_cache_defrag(struct llama_context * ctx) {
    auto & kv = ctx->kv_self;
    llama_kv_cache_defrag(kv);
}
// Public wrapper around llama_kv_cache_update_internal(); returns its status code.
int32_t llama_kv_cache_update(struct llama_context * ctx) {
    const int32_t status = llama_kv_cache_update_internal(*ctx);
    return status;
}
// Forwarder to llama_state_get_size().
size_t llama_get_state_size(struct llama_context * ctx) {
    const size_t n_bytes = llama_state_get_size(ctx);
    return n_bytes;
}
// Forwarder to llama_state_get_data(); -1 is passed as the size argument
// because this entry point receives no destination capacity.
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
    const size_t n_copied = llama_state_get_data(ctx, dst, -1);
    return n_copied;
}
// Forwarder to llama_state_set_data(); -1 is passed as the size argument
// because this entry point receives no source length.
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
    const size_t n_read = llama_state_set_data(ctx, src, -1);
    return n_read;
}
// Forwarder to llama_state_load_file() with the same arguments.
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    const bool loaded = llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
    return loaded;
}
// Forwarder to llama_state_save_file() with the same arguments.
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    const bool saved = llama_state_save_file(ctx, path_session, tokens, n_token_count);
    return saved;
}
// Pick the layer weight (wk or wv) used as the split reference for a cache tensor.
//   - no wv on the layer                     -> wk
//   - tensor has more than one row and the K-norm width does not equal
//     wk->ne[1]                              -> wk
//   - otherwise                              -> wv
static inline ggml_tensor * get_kv_cache_split_tensor(const ggml_tensor * tensor, const llama_layer & l) {
    if (!l.wv) {
        return l.wk;
    }
    const bool k_norm_matches_wk = l.attn_k_norm && l.attn_k_norm->ne[0] == l.wk->ne[1];
    if (tensor->ne[1] > 1 && !k_norm_matches_wk) {
        return l.wk;
    }
    return l.wv;
}
// Serialization front-end for context / KV-cache state.
//
// Concrete sinks implement write(), write_tensor_data() and get_size_written();
// the helper methods below define the byte layout purely by the order in which
// they issue those calls. llama_data_read consumes the same layout.
struct llama_data_write {
    // Append `size` raw bytes starting at `src`.
    virtual void write(const void * src, size_t size) = 0;
    // Append `size` bytes of `tensor`, starting at byte `offset` within it.
    // `il` is the index of the layer the tensor belongs to.
    virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) = 0;
    // Total number of bytes emitted so far.
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_write() = default;

    // Layout: uint32 length, then the characters (no terminator).
    void write_string(const std::string & str) {
        uint32_t str_size = str.size();
        write(&str_size, sizeof(str_size));
        write(str.data(), str_size);
    }

    // Writes the architecture name so a reader can reject mismatched models.
    void write_model_info(const struct llama_context * ctx) {
        std::string arch_str = llama_model_arch_name(ctx->model.arch);
        write_string(arch_str);
    }

    // Writes the textual stream representation of the RNG state.
    void write_rng(const std::mt19937 & rng) {
        std::ostringstream rng_ss;
        rng_ss << rng;
        const std::string & rng_str = rng_ss.str();
        write_string(rng_str);
    }

    // Layout: uint32 n_outputs, then n_outputs int32 batch indices.
    // ctx->output_ids maps batch index -> output row; it is inverted here so the
    // stored array maps output row -> batch index.
    void write_output_ids(const struct llama_context * ctx) {
        const uint32_t n_outputs = ctx->n_outputs;
        std::vector<int32_t> output_pos;
        const size_t n_batch = ctx->cparams.n_batch;
        const auto & output_ids = ctx->output_ids;
        GGML_ASSERT(n_outputs <= ctx->output_size);
        output_pos.resize(n_outputs);
        for (size_t i = 0; i < n_batch; ++i) {
            int32_t pos = output_ids[i];
            if (pos >= 0) {
                GGML_ASSERT((uint32_t) pos < n_outputs);
                output_pos[pos] = i;
            }
        }
        write(&n_outputs, sizeof(n_outputs));
        if (n_outputs) {
            write(output_pos.data(), n_outputs * sizeof(int32_t));
        }
    }

    // Layout: uint64 element count, then that many floats.
    // The count is capped at n_outputs * n_vocab.
    void write_logits(const struct llama_context * ctx) {
        const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_vocab);
        write(&logits_size, sizeof(logits_size));
        if (logits_size) {
            write(ctx->logits, logits_size * sizeof(float));
        }
    }

    // Layout: uint64 element count, then that many floats.
    // The count is capped at n_outputs_embd * output embedding width.
    void write_embeddings(const struct llama_context * ctx) {
        const uint64_t row_width = llama_output_embd_width(*ctx);
        const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs_embd * row_width);
        write(&embeddings_size, sizeof(embeddings_size));
        if (embeddings_size) {
            write(ctx->embd, embeddings_size * sizeof(float));
        }
    }

    // Per cell in `cell_ranges`: pos, uint32 n_seq_id, then the seq ids.
    // When a specific `seq_id` is being saved, n_seq_id is written as 0 and no
    // ids follow (the reader assigns the destination sequence itself).
    void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
        for (const auto & range : cell_ranges) {
            for (uint32_t i = range.first; i < range.second; ++i) {
                const auto & cell = kv_self.cells[i];
                const llama_pos pos = cell.pos;
                const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
                write(&pos, sizeof(pos));
                write(&n_seq_id, sizeof(n_seq_id));
                if (n_seq_id) {
                    for (auto id : cell.seq_id) {
                        write(&id, sizeof(id));
                    }
                }
            }
        }
    }

    // Writes the tensor payload for the cells in `cell_ranges`:
    //   uint32 v_state (0: regular V, 1: transposed V, 2: no V tensors), uint32 n_layer,
    //   per-layer K header (+ rows), per-layer V header (+ rows) depending on v_state,
    //   uint32 qnext_state flag and, if set, per-layer recurrent-state header (+ rows).
    // With LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY the K/V layers are written as absent
    // (type -1, zero sizes) and only the recurrent-state section carries data.
    void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1,
            llama_state_seq_flags flags = 0) {
        const struct llama_kv_cache & kv_self = ctx->kv_self;
        const struct llama_hparams & hparams = ctx->model.hparams;
        bool need_kv = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0;
        const uint32_t v_state = kv_self.v_l.empty() ? 2 : kv_self.v_trans ? 1 : 0;
        const uint32_t n_layer = kv_self.k_l.size();
        write(&v_state, sizeof(v_state));
        write(&n_layer, sizeof(n_layer));

        // K section: int32 type (-1 if absent), uint64 row size, then the rows.
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
            const uint32_t kv_lora_rank = hparams.n_lora_kv;
            const bool has_k_cache = kv_self.k_l[il] != nullptr && need_kv;
            const int32_t k_type_i = has_k_cache ? (int32_t) kv_self.k_l[il]->type : -1;
            write(&k_type_i, sizeof(k_type_i));
            // With MLA enabled the row holds kv_lora_rank + n_rot elements instead.
            const uint64_t k_size_row = has_k_cache
                ? ((ctx->cparams.mla_attn == 0)
                    ? ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa)
                    : ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope))
                : 0;
            write(&k_size_row, sizeof(k_size_row));
            if (!has_k_cache) {
                continue;
            }
            for (const auto & range : cell_ranges) {
                const size_t range_size = range.second - range.first;
                const size_t buf_size = range_size * k_size_row;
                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size, il);
            }
        }

        if (v_state == 0) {
            // Regular V: int32 type, uint64 row size, then the rows.
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;
                const int32_t v_type_i = has_v_cache ? (int32_t) kv_self.v_l[il]->type : -1;
                write(&v_type_i, sizeof(v_type_i));
                const uint64_t v_size_row = has_v_cache ? ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa) : 0;
                write(&v_size_row, sizeof(v_size_row));
                if (!has_v_cache) {
                    continue;
                }
                for (const auto & range : cell_ranges) {
                    const size_t range_size = range.second - range.first;
                    const size_t buf_size = range_size * v_size_row;
                    write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size, il);
                }
            }
        }
        else if (v_state == 1) {
            // Transposed V: int32 type, uint32 element size, uint32 n_embd_v_gqa,
            // then for every embedding index the elements of each cell range.
            const uint32_t kv_size = kv_self.size;
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;
                const int32_t v_type_i = has_v_cache ? (int32_t) kv_self.v_l[il]->type : -1;
                write(&v_type_i, sizeof(v_type_i));
                const uint32_t v_size_el = has_v_cache ? ggml_type_size(kv_self.v_l[il]->type) : 0;
                write(&v_size_el, sizeof(v_size_el));
                const uint32_t n_embd_v_gqa_write = has_v_cache ? n_embd_v_gqa : 0;
                write(&n_embd_v_gqa_write, sizeof(n_embd_v_gqa_write));
                if (!has_v_cache) {
                    continue;
                }
                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                    for (const auto & range : cell_ranges) {
                        const size_t range_size = range.second - range.first;
                        // Widen before multiplying: every operand is 32-bit, and
                        // j * kv_size exceeds UINT32_MAX for large caches.
                        const size_t src_offset = ((size_t) range.first + (size_t) j * kv_size) * v_size_el;
                        const size_t buf_size = range_size * v_size_el;
                        write_tensor_data(kv_self.v_l[il], src_offset, buf_size, il);
                    }
                }
            }
        }

        // Recurrent-state section: int32 type, uint64 row size, uint32 row count, rows.
        const uint32_t qnext_state = llama_kv_has_qnext_state_storage(kv_self) ? 1 : 0;
        write(&qnext_state, sizeof(qnext_state));
        if (qnext_state != 0) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr;
                const int32_t s_type_i = has_s_cache ? (int32_t) kv_self.s_l[il]->type : -1;
                write(&s_type_i, sizeof(s_type_i));
                const uint64_t s_size_row = has_s_cache ? ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]) : 0;
                write(&s_size_row, sizeof(s_size_row));
                uint32_t s_rows = 0;
                size_t s_offset = 0;
                if (has_s_cache) {
                    const uint32_t n_slots = (uint32_t) kv_self.s_l[il]->ne[1];
                    if (seq_id == -1) {
                        // Whole-cache save: every slot is written.
                        s_rows = n_slots;
                    } else if (llama_kv_qnext_seq_id_in_range(kv_self, seq_id) && (uint32_t) seq_id < kv_self.size) {
                        // Single sequence: write the one row its cell points at via `src`.
                        llama_seq_id src_seq_id = kv_self.cells[seq_id].src;
                        if (llama_kv_qnext_seq_id_in_range(kv_self, src_seq_id)) {
                            s_rows = 1;
                            s_offset = (size_t) src_seq_id * s_size_row;
                        }
                    }
                }
                write(&s_rows, sizeof(s_rows));
                if (has_s_cache && s_rows > 0) {
                    write_tensor_data(kv_self.s_l[il], s_offset, s_rows * s_size_row, il);
                }
            }
        }
    }

    // Collects the cells to save as half-open [begin, end) index ranges - all
    // non-empty cells for seq_id == -1, otherwise the cells holding `seq_id` -
    // then writes: uint32 cell_count, the cell metadata, and the tensor data.
    void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) {
        const struct llama_kv_cache & kv_self = ctx->kv_self;
        std::vector<std::pair<uint32_t, uint32_t>> cell_ranges;
        uint32_t cell_count = 0;
        // kv_self.size doubles as the "no range currently open" marker.
        uint32_t cell_range_begin = kv_self.size;
        for (uint32_t i = 0; i < kv_self.size; ++i) {
            const auto & cell = kv_self.cells[i];
            if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
                ++cell_count;
                if (cell_range_begin == kv_self.size) {
                    cell_range_begin = i;
                }
            } else {
                if (cell_range_begin != kv_self.size) {
                    cell_ranges.emplace_back(cell_range_begin, i);
                    cell_range_begin = kv_self.size;
                }
            }
        }
        if (cell_range_begin != kv_self.size) {
            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
        }
        // Sanity check: the ranges must cover exactly the counted cells.
        uint32_t cell_count_check = 0;
        for (const auto & range : cell_ranges) {
            cell_count_check += range.second - range.first;
        }
        GGML_ASSERT(cell_count == cell_count_check);
        write(&cell_count, sizeof(cell_count));
        write_kv_cache_meta(kv_self, cell_ranges, seq_id);
        write_kv_cache_data(ctx, cell_ranges, seq_id, flags);
    }
};
// Deserialization front-end for context / KV-cache state; the counterpart of
// llama_data_write. Concrete sources implement read(), read_to() and
// get_size_read(); the helpers below consume the fields in the order the writer
// emitted them and validate them against the live context.
struct llama_data_read {
    // Return a pointer to the next `size` bytes and advance past them.
    virtual const uint8_t * read(size_t size) = 0;
    // Copy the next `size` bytes into `dst` and advance past them.
    virtual void read_to(void * dst, size_t size) = 0;
    // Total number of bytes consumed so far.
    virtual size_t get_size_read() = 0;
    virtual ~llama_data_read() = default;

    // Layout: uint32 length, then the characters.
    void read_string(std::string & str) {
        uint32_t str_size;
        read_to(&str_size, sizeof(str_size));
        str.assign((const char *) read(str_size), str_size);
    }

    // Throws std::runtime_error if the stored architecture differs from the model's.
    void read_model_info(const struct llama_context * ctx) {
        std::string cur_arch_str = llama_model_arch_name(ctx->model.arch);
        std::string arch_str;
        read_string(arch_str);
        if (cur_arch_str != arch_str) {
            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
        }
    }

    // Restores `rng` from its textual stream form; throws if parsing fails.
    void read_rng(std::mt19937 & rng) {
        std::string rng_str;
        read_string(rng_str);
        std::istringstream rng_ss(rng_str);
        rng_ss >> rng;
        if (rng_ss.fail()) {
            throw std::runtime_error("failed to load RNG state");
        }
    }

    // Layout: uint32 n_outputs, then n_outputs int32 batch indices.
    // Rebuilds ctx->output_ids (batch index -> output row) from the stored
    // output row -> batch index array. Throws on reservation failure or on an
    // index outside the batch.
    void read_output_ids(struct llama_context * ctx) {
        std::vector<int32_t> output_pos;
        uint32_t n_outputs;
        read_to(&n_outputs, sizeof(n_outputs));
        if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
            throw std::runtime_error("could not reserve outputs");
        }
        if (n_outputs) {
            output_pos.resize(n_outputs);
            read_to(output_pos.data(), n_outputs * sizeof(int32_t));
            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
                int32_t id = output_pos[i];
                // The unsigned compare also rejects negative ids.
                if ((uint32_t) id >= ctx->cparams.n_batch) {
                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
                }
                ctx->output_ids[id] = i;
            }
            ctx->n_outputs = n_outputs;
        }
    }

    // Layout: uint64 element count, then that many floats. Throws if the count
    // exceeds the context's logits buffer.
    void read_logits(struct llama_context * ctx) {
        uint64_t logits_size;
        read_to(&logits_size, sizeof(logits_size));
        if (ctx->logits_size < logits_size) {
            throw std::runtime_error("logits buffer too small");
        }
        if (logits_size) {
            read_to(ctx->logits, logits_size * sizeof(float));
        }
    }

    // Layout: uint64 element count, then that many floats. Throws if the count
    // exceeds the embeddings buffer or is not a multiple of the row width.
    void read_embeddings(struct llama_context * ctx) {
        uint64_t embeddings_size;
        read_to(&embeddings_size, sizeof(embeddings_size));
        if (ctx->embd_size < embeddings_size) {
            throw std::runtime_error("embeddings buffer too small");
        }
        const uint64_t row_width = llama_output_embd_width(*ctx);
        if (row_width == 0 || (embeddings_size % row_width) != 0) {
            throw std::runtime_error("invalid embeddings payload size");
        }
        ctx->n_outputs_embd = embeddings_size / row_width;
        if (embeddings_size) {
            read_to(ctx->embd, embeddings_size * sizeof(float));
        }
    }

    // Restores the cell metadata for `cell_count` cells.
    //   dest_seq_id != -1: the destination sequence is cleared, then a slot for
    //                      the cells is located via llama_kv_cache_find_slot().
    //   dest_seq_id == -1: the whole cache is cleared and cells [0, cell_count)
    //                      are filled in directly.
    // Returns false (after logging) on invalid input or when no slot is found.
    bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
        struct llama_kv_cache & kv_self = ctx->kv_self;
        if (dest_seq_id != -1) {
            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);
            if (cell_count == 0) {
                // Nothing to place. Without this guard the code below would read
                // batch.pos[0] from a zero-length allocation and index
                // cells[head + 0 - 1].
                return true;
            }
            llama_batch batch = llama_batch_init(cell_count, 0, 1);
            batch.n_tokens = cell_count;
            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_pos pos;
                uint32_t n_seq_id;
                read_to(&pos, sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));
                // Single-sequence payloads are written without seq ids.
                if (n_seq_id != 0) {
                    llama_batch_free(batch);
                    LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
                    return false;
                }
                batch.pos[i] = pos;
                batch.n_seq_id[i] = 1;
                batch.seq_id[i][0] = dest_seq_id;
            }
            if (!llama_kv_cache_find_slot(kv_self, batch)) {
                llama_batch_free(batch);
                LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
                return false;
            }
            // The data pass writes at kv_self.head; check the slot matches the batch.
            GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
            GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
            GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
            llama_batch_free(batch);
        } else {
            if (cell_count > kv_self.size) {
                LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
                return false;
            }
            llama_kv_cache_clear(kv_self);
            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_kv_cell & cell = kv_self.cells[i];
                llama_pos pos;
                uint32_t n_seq_id;
                read_to(&pos, sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));
                cell.pos = pos;
                for (uint32_t j = 0; j < n_seq_id; ++j) {
                    llama_seq_id seq_id;
                    read_to(&seq_id, sizeof(seq_id));
                    if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
                        LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
                        return false;
                    }
                    cell.seq_id.insert(seq_id);
                }
            }
            kv_self.head = 0;
            kv_self.used = cell_count;
        }
        return true;
    }

    // Scatter `nrows` full rows from `data` (each `row_size` bytes) into the
    // per-device splits of `tensor`, starting at row `head` of every split.
    // For MLA models each split receives the full rows unchanged; otherwise each
    // split receives its own column slice of every row, with slice widths taken
    // from the matching weight split (or from the split itself for recurrent layers).
    void read_kv_cache_data_split(llama_context * ctx, ggml_tensor * tensor, const uint8_t * data, size_t head, size_t row_size, int nrows, int il) {
        GGML_ASSERT(il >= 0 && il < int(ctx->model.layers.size()));
        GGML_ASSERT(ggml_internal_get_type_traits(tensor->type).row_meta_size == 0);
        std::vector<uint8_t> aux;
        auto extra = (ggml_split_tensor_t *)tensor->extra;
        if (ctx->model.is_mla_model()) {
            GGML_ASSERT(extra);
            GGML_ASSERT(ctx->cparams.mla_attn == 3);
            for (int id = 0; id < extra->n_device; ++id) {
                auto split = extra->splits[id];
                if (!split) continue;
                GGML_ASSERT(split->type == tensor->type);
                ggml_backend_tensor_set(split, data, head*row_size, nrows*row_size);
            }
            return;
        }
        bool is_recurrent = ctx->model.hparams.recurrent_layer_arr[il];
        auto kv = is_recurrent ? nullptr : get_kv_cache_split_tensor(tensor, ctx->model.layers[il]);
        auto kv_extra = kv ? (ggml_split_tensor_t *)kv->extra : nullptr;
        GGML_ASSERT(extra && (is_recurrent || kv_extra));
        auto ne = kv ? kv->ne[1] : tensor->ne[0];
        size_t sum_ne = 0;
        size_t sum_split_row_size = 0;
        GGML_ASSERT(row_size == ggml_row_size(tensor->type, ne));
        for (int id = 0; id < extra->n_device; ++id) {
            auto split = extra->splits[id];
            auto kv_split = kv_extra ? kv_extra->splits[id] : nullptr;
            GGML_ASSERT((split && (kv_split || is_recurrent)) || (!split && !kv_split));
            if (!split) continue;
            GGML_ASSERT(split->type == tensor->type);
            auto ne_split = kv_split ? kv_split->ne[1] : split->ne[0];
            auto split_row_size = ggml_row_size(tensor->type, ne_split);
            // Gather this split's slice of every row into a contiguous staging buffer.
            aux.resize(split_row_size*nrows);
            auto src = data + sum_split_row_size;
            auto dst = aux.data();
            for (int row = 0; row < nrows; ++row) {
                std::memcpy(dst, src, split_row_size);
                dst += split_row_size;
                src += row_size;
            }
            ggml_backend_tensor_set(split, aux.data(), head*split_row_size, nrows*split_row_size);
            sum_ne += ne_split;
            sum_split_row_size += split_row_size;
        }
        // The slices must tile the full row exactly.
        GGML_ASSERT(sum_ne == ne);
        GGML_ASSERT(sum_split_row_size == row_size);
    }

    // Restores the tensor payload written by llama_data_write::write_kv_cache_data,
    // validating every header field against the live cache. Data for the cells
    // lands at kv_self.head. Returns false (after logging) on any mismatch.
    bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) {
        const struct llama_hparams & hparams = ctx->model.hparams;
        struct llama_kv_cache & kv_self = ctx->kv_self;
        // With PARTIAL_ONLY the K/V layers are expected to be stored as absent.
        bool need_kv = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0;
        uint32_t v_state;
        uint32_t n_layer;
        read_to(&v_state, sizeof(v_state));
        read_to(&n_layer, sizeof(n_layer));
        if (n_layer != kv_self.k_l.size()) {
            // Report the value that was actually compared against.
            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, (uint32_t) kv_self.k_l.size());
            return false;
        }
        if (cell_count > kv_self.size) {
            LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
            return false;
        }
        if (kv_self.v_trans != (v_state == 1)) {
            LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
            return false;
        }

        // K section.
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
            const uint32_t kv_lora_rank = hparams.n_lora_kv;
            const bool has_k_cache = kv_self.k_l[il] != nullptr && need_kv;
            int32_t k_type_i_ref;
            read_to(&k_type_i_ref, sizeof(k_type_i_ref));
            if (!has_k_cache) {
                if (k_type_i_ref != -1) {
                    LLAMA_LOG_ERROR("%s: missing key cache for layer %d\n", __func__, il);
                    return false;
                }
            } else {
                const int32_t k_type_i = (int32_t) kv_self.k_l[il]->type;
                if (k_type_i != k_type_i_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
                    return false;
                }
            }
            uint64_t k_size_row_ref;
            read_to(&k_size_row_ref, sizeof(k_size_row_ref));
            if (!has_k_cache) {
                if (k_size_row_ref != 0) {
                    LLAMA_LOG_ERROR("%s: expected empty key row size for layer %d\n", __func__, il);
                    return false;
                }
                continue;
            }
            const size_t k_size_row = (ctx->cparams.mla_attn == 0) ? ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa) : ggml_row_size(kv_self.k_l[il]->type, kv_lora_rank + n_embd_head_qk_rope);
            if (k_size_row != k_size_row_ref) {
                LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
                return false;
            }
            if (cell_count) {
                if (kv_self.k_l[il]->extra) {
                    read_kv_cache_data_split(ctx, kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head, k_size_row, cell_count, il);
                } else {
                    ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
                }
            }
        }

        if (v_state == 0) {
            // Regular V section.
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                if (!has_v_cache) {
                    if (v_type_i_ref != -1) {
                        LLAMA_LOG_ERROR("%s: missing value cache for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const int32_t v_type_i = (int32_t) kv_self.v_l[il]->type;
                    if (v_type_i != v_type_i_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                        return false;
                    }
                }
                uint64_t v_size_row_ref;
                read_to(&v_size_row_ref, sizeof(v_size_row_ref));
                if (!has_v_cache) {
                    if (v_size_row_ref != 0) {
                        LLAMA_LOG_ERROR("%s: expected empty value row size for layer %d\n", __func__, il);
                        return false;
                    }
                    continue;
                }
                const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
                if (v_size_row != v_size_row_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
                    return false;
                }
                if (cell_count) {
                    if (kv_self.v_l[il]->extra) {
                        read_kv_cache_data_split(ctx, kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head, v_size_row, cell_count, il);
                    } else {
                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
                    }
                }
            }
        }
        else if (v_state == 1) {
            // Transposed V section.
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = llama_kv_v_row_embd(ctx->model, hparams, il);
                const bool has_v_cache = kv_self.v_l[il] != nullptr && need_kv;
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                if (!has_v_cache) {
                    if (v_type_i_ref != -1) {
                        LLAMA_LOG_ERROR("%s: missing transposed value cache for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const int32_t v_type_i = (int32_t) kv_self.v_l[il]->type;
                    if (v_type_i != v_type_i_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                        return false;
                    }
                }
                uint32_t v_size_el_ref;
                read_to(&v_size_el_ref, sizeof(v_size_el_ref));
                if (!has_v_cache) {
                    if (v_size_el_ref != 0) {
                        LLAMA_LOG_ERROR("%s: expected empty transposed value element size for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
                    if (v_size_el != v_size_el_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
                        return false;
                    }
                }
                uint32_t n_embd_v_gqa_ref;
                read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
                if (!has_v_cache) {
                    if (n_embd_v_gqa_ref != 0) {
                        LLAMA_LOG_ERROR("%s: expected empty transposed value rows for layer %d\n", __func__, il);
                        return false;
                    }
                    continue;
                }
                if (n_embd_v_gqa != n_embd_v_gqa_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
                    return false;
                }
                if (cell_count) {
                    const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
                    if (kv_self.v_l[il]->extra) {
                        throw std::runtime_error("Transposed V cache is not sypported with split mode 'graph'");
                    }
                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                        // Widen before multiplying so j * kv_self.size cannot wrap
                        // in 32-bit arithmetic for large caches.
                        const size_t dst_offset = ((size_t) kv_self.head + (size_t) j * kv_self.size) * v_size_el;
                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                    }
                }
            }
        }

        // Recurrent-state section.
        uint32_t qnext_state_ref = 0;
        read_to(&qnext_state_ref, sizeof(qnext_state_ref));
        const bool has_qnext_state = llama_kv_has_qnext_state_storage(kv_self);
        if ((qnext_state_ref != 0) != has_qnext_state) {
            LLAMA_LOG_ERROR("%s: incompatible qwen3next state cache presence\n", __func__);
            return false;
        }
        if (qnext_state_ref != 0) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const bool has_s_cache = il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr;
                int32_t s_type_i_ref;
                read_to(&s_type_i_ref, sizeof(s_type_i_ref));
                if (!has_s_cache) {
                    if (s_type_i_ref != -1) {
                        LLAMA_LOG_ERROR("%s: missing qwen3next state cache for layer %d\n", __func__, il);
                        return false;
                    }
                } else {
                    const int32_t s_type_i = (int32_t) kv_self.s_l[il]->type;
                    if (s_type_i != s_type_i_ref) {
                        LLAMA_LOG_ERROR("%s: mismatched qwen3next state type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
                        return false;
                    }
                }
                uint64_t s_size_row_ref;
                read_to(&s_size_row_ref, sizeof(s_size_row_ref));
                const uint64_t s_size_row = has_s_cache ? ggml_row_size(kv_self.s_l[il]->type, kv_self.s_l[il]->ne[0]) : 0;
                if (s_size_row != s_size_row_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched qwen3next state row size (%zu != %zu, layer %d)\n",
                            __func__, (size_t) s_size_row, (size_t) s_size_row_ref, il);
                    return false;
                }
                uint32_t s_rows_ref;
                read_to(&s_rows_ref, sizeof(s_rows_ref));
                // Expected row count: every slot for a whole-cache restore, one row
                // (written to row `seq_id`) for a single-sequence restore.
                uint32_t s_rows = 0;
                uint32_t s_dst_row = 0;
                if (has_s_cache) {
                    const uint32_t n_slots = (uint32_t) kv_self.s_l[il]->ne[1];
                    if (seq_id == -1) {
                        s_rows = n_slots;
                    } else if (llama_kv_qnext_seq_id_in_range(kv_self, seq_id)) {
                        s_rows = 1;
                        s_dst_row = (uint32_t) seq_id;
                    }
                }
                if (s_rows_ref != s_rows) {
                    LLAMA_LOG_ERROR("%s: mismatched qwen3next state row count (%u != %u, layer %d)\n", __func__, s_rows, s_rows_ref, il);
                    return false;
                }
                if (s_rows > 0) {
                    const size_t s_data_size = s_rows * s_size_row;
                    const size_t s_dst_offset = (size_t) s_dst_row * s_size_row;
                    if (kv_self.s_l[il]->extra) {
                        read_kv_cache_data_split(ctx, kv_self.s_l[il], read(s_data_size), s_dst_row, s_size_row, s_rows, il);
                    } else {
                        ggml_backend_tensor_set(kv_self.s_l[il], read(s_data_size), s_dst_offset, s_data_size);
                    }
                }
            }
        }
        return true;
    }

    // Reads uint32 cell_count, then the cell metadata and tensor data.
    // On failure the affected state (whole cache, or just `seq_id`) is cleared
    // and std::runtime_error is thrown.
    void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) {
        uint32_t cell_count;
        read_to(&cell_count, sizeof(cell_count));
        bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count, seq_id, flags);
        if (!res) {
            if (seq_id == -1) {
                llama_kv_cache_clear(ctx);
            } else {
                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
            }
            throw std::runtime_error("failed to restore kv cache");
        }
    }
};
// Size-counting sink: nothing is stored, every write only adds its byte count
// to the running total reported by get_size_written().
struct llama_data_write_dummy : llama_data_write {
    size_t size_written = 0;

    llama_data_write_dummy() = default;

    void write(const void * /*src*/, size_t size) override {
        size_written = size_written + size;
    }

    void write_tensor_data(const struct ggml_tensor * /*tensor*/, size_t /*offset*/, size_t size, int /*il*/) override {
        size_written = size_written + size;
    }

    size_t get_size_written() override {
        return size_written;
    }
};
// llama_data_write implementation that serializes into a caller-provided
// memory region. `ptr` is the current write position, `buf_size` the number
// of bytes still available, `size_written` the number of bytes emitted.
struct llama_data_write_buffer : llama_data_write {
    uint8_t * ptr;
    size_t buf_size = 0;
    size_t size_written = 0;
    const llama_model & model;
    std::vector<uint8_t> aux_buffer; // scratch space for reading one device split at a time

    llama_data_write_buffer(uint8_t * p, size_t len, const llama_model & _model) : ptr(p), buf_size(len), model(_model) {}

    // Copies `size` bytes from `src` to the write position and advances it.
    // Throws std::runtime_error when fewer than `size` bytes remain.
    void write(const void * src, size_t size) override {
        if (buf_size < size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        memcpy(ptr, src, size);
        buf_size     -= size;
        size_written += size;
        ptr          += size;
    }

    // Reads `size` bytes at byte `offset` of `tensor` to the write position and advances it.
    // Tensors carrying split information in `extra` go through get_tensor_data_split.
    // Throws std::runtime_error when fewer than `size` bytes remain.
    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) override {
        if (buf_size < size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        if (tensor->extra == nullptr) {
            ggml_backend_tensor_get(tensor, ptr, offset, size);
        } else {
            get_tensor_data_split(tensor, offset, size, il);
        }
        buf_size     -= size;
        size_written += size;
        ptr          += size;
    }

    // Fetches data of a split tensor belonging to layer `il` into `ptr`.
    //  - types with per-row metadata are rejected with std::runtime_error
    //  - MLA models: the first non-null split is read directly
    //  - layers flagged in hparams.recurrent_layer_arr: splits are merged using the split's own ne[0]
    //  - otherwise: splits are merged using the tensor returned by get_kv_cache_split_tensor
    void get_tensor_data_split(const ggml_tensor * tensor, size_t offset, size_t size, int il) {
        const auto traits = ggml_internal_get_type_traits(tensor->type);
        if (traits.row_meta_size > 0) {
            throw std::runtime_error(std::string{"Split cache for type "} + ggml_type_name(tensor->type) + " is not supported");
        }
        GGML_ASSERT(il >= 0 && il < int(model.layers.size()));

        if (model.is_mla_model()) {
            const auto * split_info = (const ggml_split_tensor_t *)tensor->extra;
            for (int dev = 0; dev < split_info->n_device; ++dev) {
                if (!split_info->splits[dev]) {
                    continue;
                }
                ggml_backend_tensor_get(split_info->splits[dev], ptr, offset, size);
                break;
            }
            return;
        }

        if (model.hparams.recurrent_layer_arr[il]) {
            get_tensor_data_split(ptr, tensor, aux_buffer, offset, size);
            return;
        }

        auto kv = get_kv_cache_split_tensor(tensor, model.layers[il]);
        get_tensor_data_split(ptr, tensor, kv, aux_buffer, offset, size);
    }

    // Reassembles rows of `tensor` from its per-device splits into `ptr`.
    // The full row width is taken from kv->ne[1] and each split's share from the
    // matching split of `kv` (ne[1]). `offset` and `size` must be multiples of the
    // full row size. `aux_buffer` is grown as needed and used as staging.
    static void get_tensor_data_split(uint8_t * ptr, const ggml_tensor * tensor, const ggml_tensor * kv,
            std::vector<uint8_t> & aux_buffer, size_t offset, size_t size) {
        const auto full_row_size = ggml_row_size(tensor->type, kv->ne[1]);
        GGML_ASSERT(offset % full_row_size == 0);
        GGML_ASSERT(size % full_row_size == 0);
        const auto first_row = offset / full_row_size;
        const auto num_rows  = size / full_row_size;

        auto extra    = (const ggml_split_tensor_t *)tensor->extra;
        auto kv_extra = (const ggml_split_tensor_t *)kv->extra;
        GGML_ASSERT(extra && kv_extra);

        size_t split_offset = 0; // byte position of the current split inside a full row
        size_t total_size   = 0; // bytes gathered over all splits
        for (int id = 0; id < extra->n_device; ++id) {
            auto split    = extra->splits[id];
            auto kv_split = kv_extra->splits[id];
            GGML_ASSERT((split && kv_split) || (!split && !kv_split));
            if (!split) {
                continue;
            }
            GGML_ASSERT(split->type == tensor->type);

            const auto split_row_size = ggml_row_size(tensor->type, kv_split->ne[1]);
            const auto split_size     = split_row_size * num_rows;
            if (aux_buffer.size() < split_size) {
                aux_buffer.resize(split_size);
            }
            ggml_backend_tensor_get(split, aux_buffer.data(), first_row*split_row_size, split_size);

            // scatter the split's rows into their slot within each full row
            const uint8_t * src = aux_buffer.data();
            uint8_t * dst = ptr + split_offset;
            for (int row = 0; row < (int)num_rows; ++row, dst += full_row_size, src += split_row_size) {
                std::memcpy(dst, src, split_row_size);
            }

            split_offset += split_row_size;
            total_size   += split_size;
        }
        GGML_ASSERT(total_size == size);
    }

    // Same as the overload above, but row widths come from the tensor itself:
    // tensor->ne[0] for the full row and split->ne[0] for each split's share.
    static void get_tensor_data_split(uint8_t * ptr, const ggml_tensor * tensor,
            std::vector<uint8_t> & aux_buffer, size_t offset, size_t size) {
        const auto full_row_size = ggml_row_size(tensor->type, tensor->ne[0]);
        GGML_ASSERT(offset % full_row_size == 0);
        GGML_ASSERT(size % full_row_size == 0);
        const auto first_row = offset / full_row_size;
        const auto num_rows  = size / full_row_size;

        auto extra = (const ggml_split_tensor_t *)tensor->extra;
        GGML_ASSERT(extra);

        size_t split_offset = 0; // byte position of the current split inside a full row
        size_t total_size   = 0; // bytes gathered over all splits
        for (int id = 0; id < extra->n_device; ++id) {
            auto split = extra->splits[id];
            if (!split) {
                continue;
            }
            GGML_ASSERT(split->type == tensor->type);

            const auto split_row_size = ggml_row_size(tensor->type, split->ne[0]);
            const auto split_size     = split_row_size * num_rows;
            if (aux_buffer.size() < split_size) {
                aux_buffer.resize(split_size);
            }
            ggml_backend_tensor_get(split, aux_buffer.data(), first_row*split_row_size, split_size);

            // scatter the split's rows into their slot within each full row
            const uint8_t * src = aux_buffer.data();
            uint8_t * dst = ptr + split_offset;
            for (int row = 0; row < (int)num_rows; ++row, dst += full_row_size, src += split_row_size) {
                std::memcpy(dst, src, split_row_size);
            }

            split_offset += split_row_size;
            total_size   += split_size;
        }
        GGML_ASSERT(total_size == size);
    }

    // Number of bytes written into the buffer so far.
    size_t get_size_written() override {
        return size_written;
    }
};
// llama_data_read implementation that consumes a caller-provided memory region.
// `ptr` is the current read position, `buf_size` the bytes left, `size_read` the bytes consumed.
struct llama_data_read_buffer : llama_data_read {
    const uint8_t * ptr;
    size_t buf_size = 0;
    size_t size_read = 0;

    llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}

    // Returns a pointer to the next `size` bytes (no copy) and advances past them.
    // Throws std::runtime_error when fewer than `size` bytes remain.
    const uint8_t * read(size_t size) override {
        if (buf_size < size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        const uint8_t * const chunk = ptr;
        buf_size  -= size;
        size_read += size;
        ptr       += size;
        return chunk;
    }

    // Copies the next `size` bytes into `dst`.
    void read_to(void * dst, size_t size) override {
        const uint8_t * src = read(size);
        memcpy(dst, src, size);
    }

    // Number of bytes consumed so far.
    size_t get_size_read() override {
        return size_read;
    }
};
// llama_data_write implementation that streams the serialized state into a llama_file.
struct llama_data_write_file : llama_data_write {
    llama_file * file;
    size_t size_written = 0;
    std::vector<uint8_t> temp_buffer; // staging area holding tensor bytes before they are written to the file
    std::vector<uint8_t> aux_buffer;  // scratch space for reading one device split at a time
    const llama_model & model;

    llama_data_write_file(llama_file * f, const llama_model & _model) : file(f), model(_model) {}

    // Writes `size` raw bytes from `src` to the file and counts them.
    void write(const void * src, size_t size) override {
        file->write_raw(src, size);
        size_written += size;
    }

    // Reads `size` bytes at byte `offset` of `tensor` into the staging buffer and writes them out.
    // Tensors carrying split information in `extra` go through get_tensor_data_split.
    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) override {
        temp_buffer.resize(size);
        if (tensor->extra) {
            get_tensor_data_split(tensor, offset, size, il);
        } else {
            ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
        }
        write(temp_buffer.data(), temp_buffer.size());
    }

    // Fetches data of a split tensor belonging to layer `il` into `temp_buffer`.
    //  - MLA models: the first non-null split is read directly
    //  - layers flagged in hparams.recurrent_layer_arr: splits are merged using the split's own ne[0]
    //  - otherwise: splits are merged using the tensor returned by get_kv_cache_split_tensor
    void get_tensor_data_split(const struct ggml_tensor * tensor, size_t offset, size_t size, int il) {
        GGML_ASSERT(il >= 0 && il < int(model.layers.size()));
        if (model.is_mla_model()) {
            auto extra = (ggml_split_tensor_t *)tensor->extra;
            for (int id = 0; id < extra->n_device; ++id) {
                if (extra->splits[id]) {
                    ggml_backend_tensor_get(extra->splits[id], temp_buffer.data(), offset, size);
                    break;
                }
            }
            return;
        }
        temp_buffer.resize(size);
        if (model.hparams.recurrent_layer_arr[il]) {
            // Mirror llama_data_write_buffer::get_tensor_data_split: tensors of recurrent layers
            // are merged with the overload that does not need a KV-cache companion tensor, so
            // saving to a file and saving to memory treat these layers the same way.
            llama_data_write_buffer::get_tensor_data_split(temp_buffer.data(), tensor, aux_buffer, offset, size);
            return;
        }
        auto kv = get_kv_cache_split_tensor(tensor, model.layers[il]);
        llama_data_write_buffer::get_tensor_data_split(temp_buffer.data(), tensor, kv, aux_buffer, offset, size);
    }

    // Number of bytes written to the file through this writer so far.
    size_t get_size_written() override {
        return size_written;
    }
};
// llama_data_read implementation that pulls the serialized state from a llama_file.
struct llama_data_read_file : llama_data_read {
    llama_file * file;
    size_t size_read = 0;
    std::vector<uint8_t> temp_buffer; // backing storage for pointers handed out by read()

    llama_data_read_file(llama_file * f) : file(f) {}

    // Reads `size` raw bytes from the file into `dst` and counts them.
    void read_to(void * dst, size_t size) override {
        file->read_raw(dst, size);
        size_read += size;
    }

    // Reads `size` bytes into the internal buffer and returns a pointer to them.
    // The buffer is resized on every call, so the result is only good until the next read().
    const uint8_t * read(size_t size) override {
        temp_buffer.resize(size);
        uint8_t * const staging = temp_buffer.data();
        read_to(staging, size);
        return staging;
    }

    // Number of bytes consumed from the file so far.
    size_t get_size_read() override {
        return size_read;
    }
};
// Serializes the full context state through `data_ctx` and returns the number of bytes
// the writer reports. Sections are emitted in a fixed order: model info, RNG,
// output ids, logits, embeddings, KV cache.
static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
    // synchronize with the backend scheduler before reading any state
    llama_synchronize(ctx);

    data_ctx.write_model_info(ctx);
    data_ctx.write_rng(ctx->sampling.rng);
    data_ctx.write_output_ids(ctx);
    data_ctx.write_logits(ctx);
    data_ctx.write_embeddings(ctx);
    data_ctx.write_kv_cache(ctx);

    const size_t n_written = data_ctx.get_size_written();
    return n_written;
}
// Serializes the context state into `dst` (capacity `size` bytes).
// Returns the number of bytes written, or 0 after logging if serialization threw.
size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
    llama_data_write_buffer data_ctx(dst, size, ctx->model);
    size_t n_written = 0;
    try {
        n_written = llama_state_get_data_internal(ctx, data_ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
        n_written = 0;
    }
    return n_written;
}
// Computes how many bytes the serialized context state occupies by running the
// serializer against a counting-only writer. Returns 0 after logging on failure.
size_t llama_state_get_size(struct llama_context * ctx) {
    llama_data_write_dummy data_ctx;
    size_t n_bytes = 0;
    try {
        n_bytes = llama_state_get_data_internal(ctx, data_ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
        n_bytes = 0;
    }
    return n_bytes;
}
// Restores the full context state from `data_ctx` and returns the number of bytes
// the reader reports. Sections are consumed in a fixed order: model info, RNG,
// output ids, logits, embeddings, KV cache.
static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
    // synchronize with the backend scheduler before overwriting any state
    llama_synchronize(ctx);

    data_ctx.read_model_info(ctx);
    data_ctx.read_rng(ctx->sampling.rng);
    data_ctx.read_output_ids(ctx);
    data_ctx.read_logits(ctx);
    data_ctx.read_embeddings(ctx);
    data_ctx.read_kv_cache(ctx);

    const size_t n_consumed = data_ctx.get_size_read();
    return n_consumed;
}
// Restores the context state from `src` (`size` bytes available).
// Returns the number of bytes consumed, or 0 after logging if restoring threw.
size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
    llama_data_read_buffer data_ctx(src, size);
    size_t n_consumed = 0;
    try {
        n_consumed = llama_state_set_data_internal(ctx, data_ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
        n_consumed = 0;
    }
    return n_consumed;
}
// Loads a session file: validates magic/version, reads the stored prompt tokens into
// `tokens_out` (at most `n_token_capacity`), then restores the context state from the
// remainder of the file. Returns false (after logging) on a header mismatch, on too
// many tokens, or when the state did not consume exactly the rest of the file.
static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(path_session, "rb");

    // header
    const uint32_t file_magic   = file.read_u32();
    const uint32_t file_version = file.read_u32();
    if (!(file_magic == LLAMA_SESSION_MAGIC && file_version == LLAMA_SESSION_VERSION)) {
        LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, file_magic, file_version);
        return false;
    }

    // prompt tokens
    const uint32_t n_tokens_stored = file.read_u32();
    if (n_token_capacity < n_tokens_stored) {
        LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_tokens_stored, n_token_capacity);
        return false;
    }
    file.read_raw(tokens_out, sizeof(llama_token) * n_tokens_stored);
    *n_token_count_out = n_tokens_stored;

    // context state: everything that is left in the file
    const size_t n_state_bytes = file.size() - file.tell();
    llama_data_read_file data_ctx(&file);
    const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);
    if (n_read == n_state_bytes) {
        return true;
    }
    LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_bytes, n_read);
    return false;
}
// Exception-safe wrapper around llama_state_load_file_internal:
// any std::exception is logged and reported as `false`.
bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    bool loaded = false;
    try {
        loaded = llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
        loaded = false;
    }
    return loaded;
}
// Writes a session file: magic, version, token count (as 32-bit), the tokens,
// and finally the serialized context state. Always returns true; failures are
// expected to surface as exceptions from the callees.
static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    llama_file file(path_session, "wb");

    // header
    file.write_u32(LLAMA_SESSION_MAGIC);
    file.write_u32(LLAMA_SESSION_VERSION);

    // prompt tokens; the count is stored as a 32-bit value
    const uint32_t n_tokens_u32 = (uint32_t) n_token_count;
    file.write_u32(n_tokens_u32);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // context state
    llama_data_write_file data_ctx(&file, ctx->model);
    (void) llama_state_get_data_internal(ctx, data_ctx);

    return true;
}
// Exception-safe wrapper around llama_state_save_file_internal:
// any std::exception is logged and reported as `false`.
bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    bool saved = false;
    try {
        saved = llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
        saved = false;
    }
    return saved;
}
// Serializes the KV cache of `seq_id` through `data_ctx` and returns the number
// of bytes the writer reports.
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
    // synchronize with the backend scheduler before reading the cache
    llama_synchronize(ctx);

    data_ctx.write_kv_cache(ctx, seq_id, flags);

    const size_t n_written = data_ctx.get_size_written();
    return n_written;
}
size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id, llama_state_seq_flags flags) {
llama_data_write_dummy data_ctx;
return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id, flags);
}
// Serializes the state of sequence `seq_id` into `dst` (capacity `size` bytes).
// Returns the number of bytes written, or 0 after logging if serialization threw.
size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id, llama_state_seq_flags flags) {
    llama_data_write_buffer data_ctx(dst, size, ctx->model);
    size_t n_written = 0;
    try {
        n_written = llama_state_seq_get_data_internal(ctx, data_ctx, seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
        n_written = 0;
    }
    return n_written;
}
// Restores the KV cache of `dest_seq_id` from `data_ctx` and returns the number
// of bytes the reader reports.
static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id, llama_state_seq_flags flags) {
    // synchronize with the backend scheduler before overwriting the cache
    llama_synchronize(ctx);

    data_ctx.read_kv_cache(ctx, dest_seq_id, flags);

    const size_t n_consumed = data_ctx.get_size_read();
    return n_consumed;
}
// Restores the state of sequence `dest_seq_id` from `src` (`size` bytes available).
// Returns the number of bytes consumed, or 0 after logging if restoring threw.
size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id, llama_state_seq_flags flags) {
    llama_data_read_buffer data_ctx(src, size);
    size_t n_consumed = 0;
    try {
        n_consumed = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id, flags);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
        n_consumed = 0;
    }
    return n_consumed;
}
// Writes a sequence state file: magic, version, token count (as 32-bit), the tokens,
// and the serialized state of `seq_id` (flags = 0). Returns the final file position,
// which is asserted to equal header + tokens + state bytes.
static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    llama_file file(filepath, "wb");

    // header
    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
    file.write_u32(LLAMA_STATE_SEQ_VERSION);

    // prompt tokens; the count is stored as a 32-bit value
    const uint32_t n_tokens_u32 = (uint32_t) n_token_count;
    file.write_u32(n_tokens_u32);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // sequence state
    llama_data_write_file data_ctx(&file, ctx->model);
    (void) llama_state_seq_get_data_internal(ctx, data_ctx, seq_id, 0);

    const size_t res = file.tell();
    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
    return res;
}
// Loads a sequence state file into `dest_seq_id`: validates magic/version, reads the
// stored tokens into `tokens_out` (at most `n_token_capacity`), then restores the
// sequence state (flags = 0). Returns the final file position, or 0 (after logging)
// on a header mismatch, on too many tokens, or when no state bytes were restored.
static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(filepath, "rb");

    // header
    const uint32_t file_magic   = file.read_u32();
    const uint32_t file_version = file.read_u32();
    if (!(file_magic == LLAMA_STATE_SEQ_MAGIC && file_version == LLAMA_STATE_SEQ_VERSION)) {
        LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, file_magic, file_version);
        return 0;
    }

    // prompt tokens
    const uint32_t n_tokens_stored = file.read_u32();
    if (n_token_capacity < n_tokens_stored) {
        LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_tokens_stored, n_token_capacity);
        return 0;
    }
    file.read_raw(tokens_out, sizeof(llama_token) * n_tokens_stored);
    *n_token_count_out = n_tokens_stored;

    // sequence state
    const size_t state_size = file.size() - file.tell();
    llama_data_read_file data_ctx(&file);
    const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id, 0);
    if (nread == 0) {
        LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
        return 0;
    }
    GGML_ASSERT(nread <= state_size);
    GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());

    return file.tell();
}
// Exception-safe wrapper around llama_state_seq_save_file_internal:
// any std::exception is logged and reported as 0.
size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    size_t n_bytes = 0;
    try {
        n_bytes = llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
        n_bytes = 0;
    }
    return n_bytes;
}
// Exception-safe wrapper around llama_state_seq_load_file_internal:
// any std::exception is logged and reported as 0.
size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    size_t n_bytes = 0;
    try {
        n_bytes = llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
        n_bytes = 0;
    }
    return n_bytes;
}
// Stores the two thread-count settings in the context parameters.
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
    auto & cparams = ctx->cparams;
    cparams.n_threads       = n_threads;
    cparams.n_threads_batch = n_threads_batch;
}
// Returns the `n_threads` value stored in the context parameters.
uint32_t llama_n_threads(struct llama_context * ctx) {
    const auto & cparams = ctx->cparams;
    return cparams.n_threads;
}
// Returns the `n_threads_batch` value stored in the context parameters.
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
    const auto & cparams = ctx->cparams;
    return cparams.n_threads_batch;
}
// Stores the abort callback and its opaque user data pointer on the context.
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
    llama_context & context = *ctx;
    context.abort_callback      = abort_callback;
    context.abort_callback_data = abort_callback_data;
}
// Sets the `embeddings` flag in the context parameters.
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
    auto & cparams = ctx->cparams;
    cparams.embeddings = embeddings;
}
// Sets the `causal_attn` flag in the context parameters.
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    auto & cparams = ctx->cparams;
    cparams.causal_attn = causal_attn;
}
// Builds a llama_batch that refers to the caller's `tokens` array (nothing is copied
// or allocated). The five per-token array members following the token pointer are
// left null; the trailing three members are set to `pos_0`, 1 and `seq_id`.
struct llama_batch llama_batch_get_one(
        llama_token * tokens,
        int32_t n_tokens,
        llama_pos pos_0,
        llama_seq_id seq_id) {
    llama_batch batch = {
        n_tokens,
        tokens,
        nullptr, nullptr, nullptr, nullptr, nullptr,
        pos_0,
        1,
        seq_id,
    };
    return batch;
}
// Allocates the arrays of a llama_batch sized for `n_tokens_alloc` tokens.
// When `embd` is non-zero an embedding buffer of n_tokens_alloc * embd floats is
// allocated and `token` stays null; otherwise a token buffer is allocated and `embd`
// stays null. Each token gets room for `n_seq_max` sequence ids, and the seq_id
// pointer array is terminated by a null entry (llama_batch_free relies on it).
// NOTE(review): malloc results are not checked here.
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

    // exactly one of the two input buffers is allocated
    if (!embd) {
        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
    } else {
        batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
    }

    batch.pos      = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
    batch.n_seq_id = (int32_t *)   malloc(sizeof(int32_t) * n_tokens_alloc);

    // one extra slot for the terminating null pointer
    batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
    int i_tok = 0;
    while (i_tok < n_tokens_alloc) {
        batch.seq_id[i_tok] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
        ++i_tok;
    }
    batch.seq_id[n_tokens_alloc] = nullptr;

    batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);

    return batch;
}
void llama_batch_free(struct llama_batch batch) {
if (batch.token) free(batch.token);
if (batch.embd) free(batch.embd);
if (batch.pos) free(batch.pos);
if (batch.n_seq_id) free(batch.n_seq_id);
if (batch.seq_id) {
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
free(batch.seq_id[i]);
}
free(batch.seq_id);
}
if (batch.logits) free(batch.logits);
}
// Runs llama_encode_internal on `batch` and returns its result unchanged.
// Negative results are additionally logged as errors.
int32_t llama_encode(
        struct llama_context * ctx,
        struct llama_batch batch) {
    const int status = llama_encode_internal(*ctx, batch);
    if (status >= 0) {
        return status;
    }
    LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, status);
    return status;
}
// Runs llama_decode_internal on `batch` and returns its result unchanged.
// Negative results are additionally logged as errors.
int32_t llama_decode(
        struct llama_context * ctx,
        struct llama_batch batch) {
    const int status = llama_decode_internal(*ctx, batch);
    if (status >= 0) {
        return status;
    }
    LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, status);
    return status;
}
// Forwards `mtp_op_type` to the context's set_mtp_op_type member.
void llama_set_mtp_op_type(llama_context * ctx, llama_mtp_op_type mtp_op_type) {
    llama_context & context = *ctx;
    context.set_mtp_op_type(mtp_op_type);
}
// Synchronizes the backend scheduler and folds the time since t_compute_start_us
// into the timing counters: one queued token counts as an eval, several as prompt
// eval. The first synchronization with queued tokens also fixes t_load_us.
// Afterwards the queued-token count and the compute start time are reset.
void llama_synchronize(struct llama_context * ctx) {
    ggml_backend_sched_synchronize(ctx->sched);

    const auto n_queued = ctx->n_queued_tokens;

    if (n_queued > 1) {
        // several tokens: prompt processing
        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        ctx->n_p_eval += n_queued;
    } else if (n_queued == 1) {
        // single token: generation
        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        ctx->n_eval++;
    }

    if (!ctx->has_evaluated_once && n_queued > 0) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }

    ctx->n_queued_tokens = 0;
    ctx->t_compute_start_us = 0;
}
// Synchronizes the context, then returns its logits buffer pointer.
float * llama_get_logits(struct llama_context * ctx) {
    llama_synchronize(ctx);
    float * const logits = ctx->logits;
    return logits;
}
// Returns a pointer to the logits row selected by `i`.
//   i >= 0 : `i` is looked up in ctx->output_ids to obtain the row
//   i <  0 : the row is n_outputs + i, i.e. counted from the end of the outputs
// On any invalid request the reason is logged and nullptr is returned
// (builds without NDEBUG abort instead).
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
    int32_t j = -1;
    llama_synchronize(ctx);
    try {
        if (ctx->logits == nullptr) {
            throw std::runtime_error("no logits");
        }
        if (i < 0) {
            j = ctx->n_outputs + i;
            if (j < 0) {
                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
            }
        } else if ((size_t) i >= ctx->output_ids.size()) {
            // %zu is the portable conversion for size_t (unsigned long is 32-bit on LLP64 targets)
            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
        } else {
            j = ctx->output_ids[i];
        }
        if (j < 0) {
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
        if (j >= ctx->n_outputs) {
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
        }
        // widen before multiplying so that row * n_vocab cannot wrap in 32-bit arithmetic
        return ctx->logits + (size_t) j * ctx->model.hparams.n_vocab;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#endif
        return nullptr;
    }
}
// Synchronizes the context, then returns its embeddings buffer pointer.
float * llama_get_embeddings(struct llama_context * ctx) {
    llama_synchronize(ctx);
    float * const embd = ctx->embd;
    return embd;
}
// Returns a pointer to the embeddings row selected by `i`.
//   i >= 0 : `i` is looked up in ctx->output_ids to obtain the row
//   i <  0 : the row is n_outputs_embd + i, i.e. counted from the end
// The row stride is llama_output_embd_width(*ctx) floats.
// On any invalid request the reason is logged and nullptr is returned
// (builds without NDEBUG abort instead).
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
    int32_t j = -1;
    llama_synchronize(ctx);
    try {
        if (ctx->embd == nullptr) {
            throw std::runtime_error("no embeddings");
        }
        if (i < 0) {
            j = ctx->n_outputs_embd + i;
            if (j < 0) {
                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs_embd));
            }
        } else if ((size_t) i >= ctx->output_ids.size()) {
            // %zu is the portable conversion for size_t (unsigned long is 32-bit on LLP64 targets)
            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
        } else {
            j = ctx->output_ids[i];
        }
        if (j < 0) {
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
        if (j >= ctx->n_outputs_embd) {
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs_embd=%d)", j, ctx->n_outputs_embd));
        }
        return ctx->embd + (size_t) j * llama_output_embd_width(*ctx);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#endif
        return nullptr;
    }
}
// Synchronizes the context and returns the data pointer of the entry stored for
// `seq_id` in ctx->embd_seq, or nullptr when there is no such entry.
float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
    llama_synchronize(ctx);

    const auto entry = ctx->embd_seq.find(seq_id);
    return entry != ctx->embd_seq.end() ? entry->second.data() : nullptr;
}
// Forwards to the model vocabulary's token_get_text for `token`.
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return vocab.token_get_text(token);
}
// Forwards to the model vocabulary's token_get_score for `token`.
float llama_token_get_score(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return vocab.token_get_score(token);
}
// Forwards to the model vocabulary's token_get_attr for `token`.
enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return vocab.token_get_attr(token);
}
// Forwards to the model vocabulary's is_eog check for `token`.
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return vocab.is_eog(token);
}
// Forwards to the model vocabulary's is_control check for `token`.
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
    const auto & vocab = model->vocab;
    return vocab.is_control(token);
}
// Returns the vocabulary's token_bos() value.
llama_token llama_token_bos(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_bos();
}
// Returns the vocabulary's token_eos() value.
llama_token llama_token_eos(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_eos();
}
// Returns the vocabulary's token_sep() value.
llama_token llama_token_sep(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_sep();
}
// Returns the vocabulary's token_nl() value.
llama_token llama_token_nl (const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_nl();
}
// Returns the vocabulary's token_pad() value.
llama_token llama_token_pad(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_pad();
}
// Returns the vocabulary's get_add_bos() value converted to int32_t.
int32_t llama_add_bos_token(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.get_add_bos();
}
// Returns the vocabulary's get_add_eos() value converted to int32_t.
int32_t llama_add_eos_token(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.get_add_eos();
}
// Returns the vocabulary's token_prefix() value.
llama_token llama_token_prefix(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_prefix();
}
// Returns the vocabulary's token_middle() value.
llama_token llama_token_middle(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_middle();
}
// Returns the vocabulary's token_suffix() value.
llama_token llama_token_suffix(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_suffix();
}
// Returns the vocabulary's token_eot() value.
llama_token llama_token_eot(const struct llama_model * model) {
    const auto & vocab = model->vocab;
    return vocab.token_eot();
}
// Thin wrapper: hands all arguments to the model vocabulary's tokenize()
// and returns its result unchanged.
int32_t llama_tokenize(
        const struct llama_model * model,
        const char * text,
        int32_t text_len,
        llama_token * tokens,
        int32_t n_tokens_max,
        bool add_special,
        bool parse_special) {
    const auto & vocab = model->vocab;
    return vocab.tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}
// Thin wrapper: hands all arguments to the model vocabulary's token_to_piece()
// and returns its result unchanged.
int32_t llama_token_to_piece(
        const struct llama_model * model,
        llama_token token,
        char * buf,
        int32_t length,
        int32_t lstrip,
        bool special) {
    const auto & vocab = model->vocab;
    return vocab.token_to_piece(token, buf, length, lstrip, special);
}
// Thin wrapper: hands all arguments to `vocab`'s detokenize()
// and returns its result unchanged.
int32_t llama_detokenize(
        const struct llama_vocab * vocab,
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) {
    const llama_vocab & v = *vocab;
    return v.detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
// Maps a chat template string to a llm_chat_template value.
// First `tmpl` is looked up as a key in LLM_CHAT_TEMPLATES; if that fails, the
// string is scanned for characteristic substrings. The checks are ordered and the
// first match wins, so their order must not be changed.
// Returns LLM_CHAT_TEMPLATE_UNKNOWN when nothing matches.
static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
    const auto named = LLM_CHAT_TEMPLATES.find(tmpl);
    if (named != LLM_CHAT_TEMPLATES.end()) {
        return named->second;
    }

    // true when `needle` occurs anywhere in the template
    auto has = [&tmpl](const char * needle) -> bool {
        return tmpl.find(needle) != std::string::npos;
    };

    if (has("<|im_start|>")) {
        return LLM_CHAT_TEMPLATE_CHATML;
    }

    if (tmpl.find("mistral") == 0 || has("[INST]")) {
        if (has("[SYSTEM_PROMPT]")) {
            return LLM_CHAT_TEMPLATE_MISTRAL_V7;
        }
        if (has("' [INST] ' + system_message") || has("[AVAILABLE_TOOLS]")) {
            if (has(" [INST]")) {
                return LLM_CHAT_TEMPLATE_MISTRAL_V1;
            }
            if (has("\"[INST]\"")) {
                return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
            }
            return LLM_CHAT_TEMPLATE_MISTRAL_V3;
        }
        // llama2-style templates; priority: strip > bos-in-history > system support > plain
        if (has("content.strip()")) {
            return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
        }
        if (has("bos_token + '[INST]")) {
            return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
        }
        if (has("<<SYS>>")) {
            return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
        }
        return LLM_CHAT_TEMPLATE_LLAMA_2;
    }

    if (has("[gMASK]sop")) {
        return LLM_CHAT_TEMPLATE_CHATGLM_3;
    }
    if (has("[gMASK]<sop>")) {
        return LLM_CHAT_TEMPLATE_CHATGLM_4;
    }
    if (has("<|assistant|>") && has("<|end|>")) {
        return LLM_CHAT_TEMPLATE_PHI_3;
    }
    if (has("<|assistant|>") && has("<|user|>")) {
        return LLM_CHAT_TEMPLATE_FALCON_3;
    }
    // NOTE(review): this condition can never hold - a string equal to "falcon_e" contains
    // neither "assistant" nor "user". Kept as is; the intended condition needs confirming.
    if (tmpl == "falcon_e" && (has("assistant") && has("user"))) {
        return LLM_CHAT_TEMPLATE_FALCON_E;
    }
    if (has("<|user|>") && has("<|endoftext|>")) {
        return LLM_CHAT_TEMPLATE_ZEPHYR;
    }
    if (has("bos_token + message['role']")) {
        return LLM_CHAT_TEMPLATE_MONARCH;
    }
    if (has("<start_of_turn>")) {
        return LLM_CHAT_TEMPLATE_GEMMA;
    }
    if (has("'\\n\\nAssistant: ' + eos_token")) {
        return LLM_CHAT_TEMPLATE_ORION;
    }
    if (has("GPT4 Correct ")) {
        return LLM_CHAT_TEMPLATE_OPENCHAT;
    }
    if (has("USER: ") && has("ASSISTANT: ")) {
        return has("SYSTEM: ") ? LLM_CHAT_TEMPLATE_VICUNA_ORCA : LLM_CHAT_TEMPLATE_VICUNA;
    }
    if (has("### Instruction:") && has("<|EOT|>")) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK;
    }
    if (has("<|START_OF_TURN_TOKEN|>") && has("<|USER_TOKEN|>")) {
        return LLM_CHAT_TEMPLATE_COMMAND_R;
    }
    if (has("<|start_header_id|>") && has("<|end_header_id|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA_3;
    }
    if (has(LU8("<用户>"))) {
        return LLM_CHAT_TEMPLATE_MINICPM;
    }
    if (has("'Assistant: ' + message['content'] + eos_token")) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
    }
    if (has(LU8("<|Assistant|>")) && has(LU8("<|User|>")) && has(LU8("<|end▁of▁sentence|>"))) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
    }
    if (has("[|system|]") && has("[|assistant|]") && has("[|endofturn|]")) {
        return LLM_CHAT_TEMPLATE_EXAONE_3;
    }
    if (has("rwkv-world")) {
        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
    }
    if (has("<|start_of_role|>")) {
        return LLM_CHAT_TEMPLATE_GRANITE;
    }
    if (has("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
        return LLM_CHAT_TEMPLATE_GIGACHAT;
    }
    if (has("<|role_start|>")) {
        return LLM_CHAT_TEMPLATE_MEGREZ;
    }
    if (has("<role>ASSISTANT</role>") && has("'HUMAN'")) {
        return LLM_CHAT_TEMPLATE_BAILING;
    }
    if (has("<role>ASSISTANT</role>") && has("\"HUMAN\"") && has("<think>")) {
        return LLM_CHAT_TEMPLATE_BAILING_THINK;
    }
    if (has("<role>ASSISTANT</role>") && has("<role>HUMAN</role>") && has("<|role_end|>")) {
        return LLM_CHAT_TEMPLATE_BAILING2;
    }
    if (has("<|header_start|>") && has("<|header_end|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA4;
    }
    if (has("<|endofuserprompt|>")) {
        return LLM_CHAT_TEMPLATE_DOTS1;
    }
    if (has("<|startoftext|>") && has("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
    }
    if (has("<|im_middle|>") && has("<|im_end|>")) {
        return LLM_CHAT_TEMPLATE_KIMI_K2;
    }
    if (has("'Assistant: ' + message['content'] + '<|separator|>")) {
        return LLM_CHAT_TEMPLATE_GROK_2;
    }
    if (has("<|start|>") && has("<|channel|>")) {
        return LLM_CHAT_TEMPLATE_OPENAI_MOE;
    }
    if (has("<seed:bos>")) {
        return LLM_CHAT_TEMPLATE_SEED_OSS;
    }

    return LLM_CHAT_TEMPLATE_UNKNOWN;
}
// Renders `chat` into `dest` using the hard-coded prompt layout selected by `tmpl`.
//
//   tmpl    - built-in template to apply
//   chat    - messages to render, in order; role and content are read as C strings
//   dest    - receives the formatted prompt; only written when a layout for `tmpl` exists
//   add_ass - when true, layouts that support it append the opening of an assistant turn
//
// Returns the size in bytes of the formatted prompt, or -1 when `tmpl` has no layout here.
static int32_t llama_chat_apply_template_internal(
        const llm_chat_template tmpl,
        const std::vector<const llama_chat_message *> & chat,
        std::string & dest, bool add_ass) {
    std::stringstream ss;
    if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
        // every message: <|im_start|>ROLE\nCONTENT<|im_end|>\n
        for (auto message : chat) {
            ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
        }
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
        // dedicated system-prompt tags; any role other than system/user is closed with </s>
        for (auto message : chat) {
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
            } else if (role == "user") {
                ss << "[INST] " << content << "[/INST]";
            }
            else {
                ss << " " << content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
            || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
        // the three variants differ only in the spacing around [INST]/[/INST] and in
        // whether the assistant text is trimmed
        std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
        std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
        bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
        bool is_inside_turn = false;
        for (auto message : chat) {
            if (!is_inside_turn) {
                ss << leading_space << "[INST]" << trailing_space;
                is_inside_turn = true;
            }
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
                ss << content << "\n\n";
            } else if (role == "user") {
                ss << content << leading_space << "[/INST]";
            } else {
                // anything that is not system/user ends the current turn
                ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
                is_inside_turn = false;
            }
        }
    } else if (
            tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
            || tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
        // the variants toggle <<SYS>> wrapping, <s> before later turns, and content trimming
        bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
        bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
        bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
        // the first turn is opened unconditionally, even for an empty chat
        bool is_inside_turn = true;
        ss << "[INST] ";
        for (auto message : chat) {
            std::string content = strip_message ? trim(message->content) : message->content;
            std::string role(message->role);
            if (!is_inside_turn) {
                is_inside_turn = true;
                ss << (add_bos_inside_history ? "<s>[INST] " : "[INST] ");
            }
            if (role == "system") {
                if (support_system_message) {
                    ss << "<<SYS>>\n" << content << "\n<</SYS>>\n\n";
                } else {
                    ss << content << "\n";
                }
            } else if (role == "user") {
                ss << content << " [/INST]";
            } else {
                ss << content << "</s>";
                is_inside_turn = false;
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>\n" << message->content << "\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_E) {
        // role is written immediately before the content, with no separator
        for (auto message : chat) {
            std::string role(message->role);
            ss << role << message->content << "\n";
        }
        if (add_ass) {
            ss << "assistant\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
        for (auto message : chat) {
            ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
        }
        if (add_ass) {
            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
        for (auto message : chat) {
            // <s> is emitted for every message except the first one
            std::string bos = (message == chat.front()) ? "" : "<s>";
            ss << bos << message->role << "\n" << message->content << "</s>\n";
        }
        if (add_ass) {
            ss << "<s>assistant\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
        // there is no system turn: the (trimmed) system text is held back and prepended
        // to the next turn that is not a model turn
        std::string system_prompt = "";
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                system_prompt = trim(message->content);
                continue;
            }
            // "assistant" is spelled "model" in this layout
            role = role == "assistant" ? "model" : message->role;
            ss << "<start_of_turn>" << role << "\n";
            if (!system_prompt.empty() && role != "model") {
                ss << system_prompt << "\n\n";
                system_prompt = "";
            }
            ss << trim(message->content) << "<end_of_turn>\n";
        }
        if (add_ass) {
            ss << "<start_of_turn>model\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
        // the system text is held back and prepended to the next user message
        std::string system_prompt = "";
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                system_prompt = message->content;
                continue;
            } else if (role == "user") {
                ss << "Human: ";
                if (!system_prompt.empty()) {
                    ss << system_prompt << "\n\n";
                    system_prompt = "";
                }
                ss << message->content << "\n\nAssistant: </s>";
            } else {
                ss << message->content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content << "<|end_of_turn|>";
            } else {
                // capitalise the first letter of the role
                role[0] = toupper(role[0]);
                ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>";
            }
        }
        if (add_ass) {
            ss << "GPT4 Correct Assistant:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
        // the two variants differ only in how the system message is written;
        // roles other than system/user/assistant are dropped
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
                    ss << "SYSTEM: " << message->content << "\n";
                } else {
                    ss << message->content << "\n\n";
                }
            } else if (role == "user") {
                ss << "USER: " << message->content << "\n";
            } else if (role == "assistant") {
                ss << "ASSISTANT: " << message->content << "</s>\n";
            }
        }
        if (add_ass) {
            ss << "ASSISTANT:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content;
            } else if (role == "user") {
                ss << "### Instruction:\n" << message->content << "\n";
            } else if (role == "assistant") {
                ss << "### Response:\n" << message->content << "\n<|EOT|>\n";
            }
        }
        if (add_ass) {
            ss << "### Response:\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
            } else if (role == "user") {
                ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
            } else if (role == "assistant") {
                ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>";
            }
        }
        if (add_ass) {
            ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>";
        }
        if (add_ass) {
            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
        ss << "[gMASK]" << "sop";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n " << message->content;
        }
        if (add_ass) {
            ss << "<|assistant|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
        ss << "[gMASK]" << "<sop>";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n" << message->content;
        }
        if (add_ass) {
            ss << "<|assistant|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
        // only user messages get markers; everything else is emitted as trimmed text
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "user") {
                ss << LU8("<用户>");
                ss << trim(message->content);
                ss << "<AI>";
            } else {
                ss << trim(message->content);
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content << "\n\n";
            } else if (role == "user") {
                ss << "User: " << message->content << "\n\n";
            } else if (role == "assistant") {
                ss << "Assistant: " << message->content << LU8("<|end▁of▁sentence|>");
            }
        }
        if (add_ass) {
            ss << "Assistant:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_3) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << message->content << "\n\n";
            } else if (role == "user") {
                ss << LU8("<|User|>") << message->content;
            } else if (role == "assistant") {
                ss << LU8("<|Assistant|>") << message->content << LU8("<|end▁of▁sentence|>");
            }
        }
        if (add_ass) {
            ss << LU8("<|Assistant|>");
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
            } else if (role == "user") {
                ss << "[|user|]" << trim(message->content) << "\n";
            } else if (role == "assistant") {
                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
            }
        }
        if (add_ass) {
            ss << "[|assistant|]";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
        // the assistant prefix is written right after each user message; add_ass is not consulted
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "user") {
                ss << "User: " << message->content << "\n\nAssistant:";
            } else {
                ss << message->content << "\n\n";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
        for (const auto & message : chat) {
            std::string role(message->role);
            ss << "<|start_of_role|>" << role << "<|end_of_role|>";
            if (role == "assistant_tool_call") {
                ss << "<|tool_call|>";
            }
            ss << message->content << "<|end_of_text|>\n";
        }
        if (add_ass) {
            ss << "<|start_of_role|>assistant<|end_of_role|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
        // a leading system message is folded into the <s> prefix and skipped by the loop below
        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
        if (has_system) {
            ss << "<s>" << chat[0]->content << "<|message_sep|>";
        } else {
            ss << "<s>";
        }
        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
            std::string role(chat[i]->role);
            if (role == "user") {
                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
                   << "available functions<|role_sep|>[]<|message_sep|>";
            } else if (role == "assistant") {
                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
            }
        }
        if (add_ass) {
            ss << "assistant<|role_sep|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>";
        }
        if (add_ass) {
            ss << "<|role_start|>assistant<|role_end|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
        for (auto message : chat) {
            std::string role(message->role);
            // "user" is written as HUMAN; every other role is upper-cased
            if (role == "user") {
                role = "HUMAN";
            } else {
                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
            }
            ss << "<role>" << role << "</role>" << message->content;
        }
        if (add_ass) {
            ss << "<role>ASSISTANT</role>";
            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
                ss << "<think>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
        // a default system turn is injected when the chat does not start with one
        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
        if (!has_system) {
            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
        }
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "user") {
                role = "HUMAN";
            } else {
                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
            }
            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
        }
        if (add_ass) {
            ss << "<role>ASSISTANT</role>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
        }
        if (add_ass) {
            ss << "<|header_start|>assistant<|header_end|>\n\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_BITNET) {
        // NOTE(review): system_prompt is never assigned a non-empty value in this branch,
        // so the prepend inside the user case never fires; kept as-is to preserve output.
        std::string system_prompt = "";
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "System: ";
                ss << message->content;
            } else if (role == "user") {
                ss << "User: ";
                if (!system_prompt.empty()) {
                    ss << system_prompt;
                    system_prompt = "";
                }
                ss << message->content << "<|eot_id|>Assistant: ";
            } else {
                ss << message->content;
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|system|>" << message->content << "<|endofsystem|>";
            } else if (role == "user") {
                ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
            } else {
                ss << "<|response|>" << message->content << "<|endofresponse|>";
            }
        }
        if (add_ass) {
            ss << "<|response|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) {
        // the role is only visible through the terminator that follows the content
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
            } else if (role == "assistant") {
                ss << "<|startoftext|>" << message->content << "<|eos|>";
            } else {
                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|im_system|>system<|im_middle|>" << message->content << "<|im_end|>";
            } else if (role == "user") {
                // user turns carry the user marker (previously the user/assistant markers
                // were swapped, so user text was rendered as an assistant turn)
                ss << "<|im_user|>user<|im_middle|>" << message->content << "<|im_end|>";
            } else {
                // assistant (and any other role) is rendered as an assistant turn
                ss << "<|im_assistant|>assistant<|im_middle|>" << message->content << "<|im_end|>";
            }
        }
        if (add_ass) {
            ss << "<|im_assistant|>assistant<|im_middle|>";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_OPENAI_MOE) {
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|start|>" << role << "<|message|>" << message->content;
            // assistant messages are closed with a different token than the other roles
            ss << (role == "assistant" ? "<|return|>" : "<|end|>");
        }
        if (add_ass) {
            ss << "<|start|>assistant";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
        // system and user content is trimmed; assistant content is written verbatim
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
            }
            else if (role == "user") {
                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
            }
            else if (role == "assistant") {
                ss << "Assistant: " << message->content << "<|separator|>\n\n";
            }
        }
        if (add_ass) {
            ss << "Assistant:";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
        // only assistant content is trimmed
        for (auto message: chat) {
            std::string role(message->role);
            ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
        }
        if (add_ass) {
            ss << "<seed:bos>assistant\n";
        }
    } else {
        // no built-in layout for this template
        return -1;
    }
    dest = ss.str();
    return static_cast<int32_t>(dest.size());
}
int32_t llama_chat_apply_template(
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length) {
std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
std::vector<const llama_chat_message *> chat_vec;
chat_vec.resize(n_msg);
for (size_t i = 0; i < n_msg; i++) {
chat_vec[i] = &chat[i];
}
std::string formatted_chat;
llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
return -1;
}
int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
if (res < 0) {
return res;
}
if (buf && length > 0) {
strncpy(buf, formatted_chat.c_str(), length);
}
return res;
}
// Writes the names of the built-in chat templates into `output` (at most `len` entries,
// in the iteration order of LLM_CHAT_TEMPLATES) and returns the total number of built-in
// templates, independent of `len`. The stored pointers refer to the keys of
// LLM_CHAT_TEMPLATES.
int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
    const size_t n_total = LLM_CHAT_TEMPLATES.size();
    const size_t n_copy  = std::min(len, n_total);

    size_t slot = 0;
    for (auto entry = LLM_CHAT_TEMPLATES.begin(); slot < n_copy; ++entry, ++slot) {
        output[slot] = entry->first.c_str();
    }
    return (int32_t) n_total;
}
// Public entry point: hands `grammar` to llama_grammar_free_impl.
void llama_grammar_free(struct llama_grammar * grammar) {
    llama_grammar_free_impl(grammar);
}
// Public entry point: returns the result of llama_grammar_clone_impl for `*grammar`.
// `grammar` is dereferenced unconditionally, so it must not be null.
struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
    const struct llama_grammar & source = *grammar;
    return llama_grammar_clone_impl(source);
}
// Forwards to llama_grammar_sample_impl with the vocabulary and sampling state taken
// from `ctx`. `ctx` is dereferenced unconditionally.
void llama_grammar_apply(
        const struct llama_grammar * grammar,
        const struct llama_context * ctx,
        llama_token_data_array * candidates) {
    const auto * vocab    = &ctx->model.vocab;
    const auto * sampling = &ctx->sampling;
    llama_grammar_sample_impl(grammar, vocab, sampling, candidates);
}
// Same operation as llama_grammar_apply, with a different argument order.
void llama_sample_grammar(
        struct llama_context * ctx,
        llama_token_data_array * candidates,
        const struct llama_grammar * grammar) {
    llama_grammar_apply(grammar, ctx, candidates);
}
// Forwards `token` to llama_grammar_accept_impl together with the vocabulary and
// sampling state of `ctx`. Both `grammar` and `ctx` are dereferenced unconditionally.
void llama_grammar_accept_token(
        struct llama_grammar * grammar,
        struct llama_context * ctx,
        llama_token token) {
    auto * vocab    = &ctx->model.vocab;
    auto * sampling = &ctx->sampling;
    llama_grammar_accept_impl(*grammar, vocab, sampling, token);
}
// Forwards `seed` to llama_set_rng_seed_impl for the sampling state of `ctx`.
// `ctx` is dereferenced unconditionally.
void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed) {
    auto * sampling = &ctx->sampling;
    llama_set_rng_seed_impl(sampling, seed);
}
// Calls llama_sample_softmax_impl with its final flag set to true, using the sampling
// state of `ctx` (or nullptr when `ctx` is null).
void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_softmax_impl(sampling, candidates, true);
}
// Calls llama_sample_softmax_impl with its final flag set to false, using the sampling
// state of `ctx` (or nullptr when `ctx` is null).
void llama_sample_dist(struct llama_context * ctx, llama_token_data_array * candidates) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_softmax_impl(sampling, candidates, false);
}
// Forwards to llama_sample_top_k_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int32_t k, size_t min_keep) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_top_k_impl(sampling, candidates, k, min_keep);
}
// Forwards to llama_sample_top_p_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_top_p_impl(sampling, candidates, p, min_keep);
}
// Forwards to llama_sample_min_p_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_min_p_impl(sampling, candidates, p, min_keep);
}
// Forwards to llama_sample_tail_free_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_tail_free_impl(sampling, candidates, z, min_keep);
}
// Forwards to llama_sample_typical_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_typical_impl(sampling, candidates, p, min_keep);
}
// Forwards to llama_sample_entropy_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_entropy(struct llama_context * ctx, llama_token_data_array * candidates_p, float min_temp, float max_temp, float exponent_val) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_entropy_impl(sampling, candidates_p, min_temp, max_temp, exponent_val);
}
// Forwards to llama_sample_temp_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_temp_impl(sampling, candidates_p, temp);
}
// Forwards to llama_sample_xtc_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_xtc(struct llama_context * ctx, llama_token_data_array * candidates_p,
        float probability, float threshold, size_t min_keep) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_xtc_impl(sampling, candidates_p, probability, threshold, min_keep);
}
// Forwards to llama_sample_top_n_sigma_impl with the sampling state of `ctx`
// (nullptr when `ctx` is null).
void llama_sample_top_n_sigma(struct llama_context * ctx, llama_token_data_array * candidates_p, float top_n_sigma) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_top_n_sigma_impl(sampling, candidates_p, top_n_sigma);
}
// Applies the DRY sampler `smpl` to `candidates_p` through llama_sampler_dry_apply.
// `ctx` is accepted but not used by this function.
void llama_sample_dry(
        [[maybe_unused]] struct llama_context* ctx,
        struct llama_sampler_dry* smpl,
        llama_token_data_array* candidates_p) {
    llama_sampler_dry_apply(smpl, candidates_p);
}
// Forwards to llama_sample_adaptive_p_impl with the sampling state of `ctx`.
// `ctx` is dereferenced unconditionally.
void llama_sample_adaptive_p(llama_context * ctx,
        llama_token_data_array * candidates,
        llama_sampler_adaptive_p * adapt_p_ctx) {
    auto * sampling = &ctx->sampling;
    llama_sample_adaptive_p_impl(sampling, candidates, adapt_p_ctx);
}
// Forwards to llama_prep_adaptive_p_impl with the sampling state of `ctx`.
// `ctx` is dereferenced unconditionally.
void llama_prep_adaptive_p(struct llama_context * ctx, llama_token_data_array * candidates, struct llama_sampler_adaptive_p * adapt_p_ctx) {
    auto * sampling = &ctx->sampling;
    llama_prep_adaptive_p_impl(sampling, candidates, adapt_p_ctx);
}
// Forwards all penalty parameters to llama_sample_repetition_penalties_impl with the
// sampling state of `ctx` (nullptr when `ctx` is null).
void llama_sample_repetition_penalties(
        struct llama_context * ctx,
        llama_token_data_array * candidates,
        const llama_token * last_tokens,
        size_t penalty_last_n,
        float penalty_repeat,
        float penalty_freq,
        float penalty_present) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    llama_sample_repetition_penalties_impl(
            sampling, candidates, last_tokens, penalty_last_n,
            penalty_repeat, penalty_freq, penalty_present);
}
// Forwards to llama_sample_apply_guidance_impl with the sampling state of `ctx`.
// `ctx` is dereferenced unconditionally.
void llama_sample_apply_guidance(
        struct llama_context * ctx,
        float * logits,
        float * logits_guidance,
        float scale) {
    auto * sampling = &ctx->sampling;
    llama_sample_apply_guidance_impl(sampling, logits, logits_guidance, scale);
}
// Returns the token chosen by llama_sample_token_mirostat_impl, called with the
// sampling state of `ctx`. `ctx` is dereferenced unconditionally.
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
    auto * sampling = &ctx->sampling;
    return llama_sample_token_mirostat_impl(sampling, candidates, tau, eta, m, mu);
}
// Returns the token chosen by llama_sample_token_mirostat_v2_impl, called with the
// sampling state of `ctx` (nullptr when `ctx` is null).
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    return llama_sample_token_mirostat_v2_impl(sampling, candidates, tau, eta, mu);
}
// Returns the token chosen by llama_sample_token_greedy_impl, called with the
// sampling state of `ctx` (nullptr when `ctx` is null).
llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
    auto * sampling = ctx ? &ctx->sampling : nullptr;
    return llama_sample_token_greedy_impl(sampling, candidates);
}
// Returns the token chosen by llama_sample_token_with_rng_impl using the caller-supplied
// generator `rng` and the sampling state of `ctx`. `ctx` is dereferenced unconditionally.
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) {
    auto * sampling = &ctx->sampling;
    return llama_sample_token_with_rng_impl(sampling, candidates, rng);
}
// Returns the token chosen by llama_sample_token_with_rng_impl using the generator
// stored in the sampling state of `ctx`. `ctx` is dereferenced unconditionally.
llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
    auto & sampling = ctx->sampling;
    return llama_sample_token_with_rng_impl(&sampling, candidates, sampling.rng);
}
// Returns the token chosen by llama_sample_token_adaptive_p_impl, called with the
// sampling state of `ctx`. `ctx` is dereferenced unconditionally.
llama_token llama_sample_token_adaptive_p(
        struct llama_context * ctx,
        llama_token_data_array * candidates,
        struct llama_sampler_adaptive_p * adapt_p_ctx) {
    auto * sampling = &ctx->sampling;
    return llama_sample_token_adaptive_p_impl(sampling, candidates, adapt_p_ctx);
}
// Builds the file name of one split of a multi-part model:
//   "<path_prefix>-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf"
//
//   split_path   - destination buffer
//   maxlen       - capacity of `split_path` in bytes, including the terminating NUL
//   path_prefix  - text placed in front of the split suffix
//   split_no     - zero-based index of the split (written one-based)
//   split_count  - total number of splits
//
// Returns the number of characters stored in `split_path` (excluding the NUL). When the
// buffer is too small the name is truncated and the truncated length is returned.
// Returns 0 when there is no usable buffer or formatting fails.
int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) {
    static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf";

    // without a buffer nothing can be written; measuring it afterwards would read
    // memory that snprintf never touched
    if (split_path == nullptr || maxlen == 0) {
        return 0;
    }

    const int n_needed = snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count);
    if (n_needed < 0) {
        // snprintf reports an output error with a negative value; leave a valid empty string
        split_path[0] = '\0';
        return 0;
    }

    // snprintf returns the length the full name would have had; report what was stored
    if (static_cast<size_t>(n_needed) >= maxlen) {
        return static_cast<int>(maxlen - 1);
    }
    return n_needed;
}
// Creates a DRY sampler through llama_sampler_init_dry_impl, passing the vocabulary and
// its token count (`vocab->n_tokens()`). `vocab` is dereferenced unconditionally.
struct llama_sampler_dry * llama_sampler_init_dry(const struct llama_vocab* vocab, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    const struct llama_vocab & vocab_ref = *vocab;
    return llama_sampler_init_dry_impl(
            vocab_ref, vocab_ref.n_tokens(),
            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
            seq_breakers, num_breakers);
}
// Clears the token history and the repeat bookkeeping of `smpl`. A null sampler is ignored.
void llama_sampler_dry_reset(struct llama_sampler_dry* smpl) {
    if (smpl != nullptr) {
        smpl->last_tokens.clear();
        smpl->dry_repeat_count.clear();
        smpl->dry_max_token_repeat.clear();
    }
}
// Destroys a DRY sampler with `delete`; passing null is a no-op.
void llama_sampler_dry_free(struct llama_sampler_dry* smpl) {
    delete smpl;
}
struct llama_sampler_dry* llama_sampler_dry_clone(struct llama_sampler_dry* smpl) {
if (!smpl) {
return nullptr;
}
return new llama_sampler_dry {
smpl->total_context_size,
smpl->dry_multiplier,
smpl->dry_base,
smpl->dry_allowed_length,
smpl->dry_penalty_last_n,
smpl->dry_processed_breakers,
smpl->dry_repeat_count,
smpl->dry_max_token_repeat,
smpl->last_tokens,
};
}
// Appends `token` to the sampler's token history. Nothing is recorded when `smpl` is
// null, the multiplier is zero, the base is below 1, or the penalty window is zero.
void llama_sampler_dry_accept(struct llama_sampler_dry* smpl, llama_token token) {
    if (smpl == nullptr) {
        return;
    }

    const bool sampler_disabled =
        smpl->dry_multiplier == 0.0f ||
        smpl->dry_base < 1.0f ||
        smpl->dry_penalty_last_n == 0;

    if (!sampler_disabled) {
        smpl->last_tokens.push_back(token);
    }
}
// Public entry point: returns the sampler created by llama_init_adaptive_p_impl with
// the given arguments, unchanged.
struct llama_sampler_adaptive_p * llama_init_adaptive_p(int n_vocab, const float target, const float decay, const bool updt_w_cur, const uint32_t seed) {
    struct llama_sampler_adaptive_p * created =
        llama_init_adaptive_p_impl(n_vocab, target, decay, updt_w_cur, seed);
    return created;
}
// Returns a copy-constructed sampler allocated with `new`, or nullptr when
// `adapt_p_ctx` is null.
struct llama_sampler_adaptive_p * llama_clone_adaptive_p(const struct llama_sampler_adaptive_p * adapt_p_ctx) {
    return adapt_p_ctx != nullptr
        ? new llama_sampler_adaptive_p(*adapt_p_ctx)
        : nullptr;
}
// Destroys an adaptive-p sampler with `delete`; passing null is a no-op.
void llama_free_adaptive_p(struct llama_sampler_adaptive_p * adapt_p_ctx) {
    delete adapt_p_ctx;
}
// Public entry point: passes its arguments unchanged to llama_review_adaptive_p_impl.
void llama_review_adaptive_p(
        struct llama_sampler_adaptive_p * adapt_p_ctx,
        const size_t n_unsent,
        const bool rewind_status) {
    llama_review_adaptive_p_impl(adapt_p_ctx, n_unsent, rewind_status);
}
// Extracts the prefix of a split file name. If `split_path` ends with
// "-<split_no + 1, 5 digits>-of-<split_count, 5 digits>.gguf" and something precedes that
// suffix, the preceding part is copied into `dest` (limited to `maxlen` bytes including
// the NUL) and its full length is returned. Otherwise returns 0 and leaves `dest` alone.
int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) {
    const std::string full_path(split_path);

    char suffix_buf[32];
    snprintf(suffix_buf, sizeof(suffix_buf), "-%05d-of-%05d.gguf", split_no + 1, split_count);
    const std::string suffix(suffix_buf);

    // length of whatever precedes the expected suffix; non-positive when the path is too short
    const int size_prefix = full_path.size() - suffix.size();
    if (size_prefix <= 0) {
        return 0;
    }
    // the tail of the path must be exactly the expected suffix
    if (full_path.compare(size_prefix, std::string::npos, suffix) != 0) {
        return 0;
    }

    // copy the prefix only: snprintf stops one byte short of the size it is given
    const size_t n_copy = std::min((size_t) size_prefix + 1, maxlen);
    snprintf(dest, n_copy, "%s", split_path);
    return size_prefix;
}
struct llama_timings llama_get_timings(struct llama_context * ctx) {
struct llama_timings result = {
1e-3 * ctx->t_start_us,
1.00 * ggml_time_ms(),
1e-3 * ctx->t_load_us,
1e-3 * ctx->sampling.t_sample_us,
1e-3 * ctx->t_p_eval_us,
1e-3 * ctx->t_eval_us,
std::max(1, ctx->sampling.n_sample),
std::max(0, ctx->n_p_eval),
std::max(1, ctx->n_eval),
};
return result;
}
void llama_print_timings(struct llama_context * ctx) {
const llama_timings timings = llama_get_timings(ctx);
LLAMA_LOG_INFO("\n");
LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, timings.t_load_ms);
LLAMA_LOG_INFO("%s: sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}
// Restarts timing for `ctx`: the start stamp becomes the current ggml_time_us(), the
// eval and prompt-eval counters and durations are zeroed, and the sampling state's own
// timings are reset.
void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us = ggml_time_us();

    ctx->n_eval    = 0;
    ctx->t_eval_us = 0;

    ctx->n_p_eval    = 0;
    ctx->t_p_eval_us = 0;

    ctx->sampling.reset_timings();
}
// Returns a " | "-separated list of "NAME = value" entries, one per ggml_cpu_has_*()
// query below. The text lives in a function-local static string that is rebuilt on
// every call, so the returned pointer is only valid until the next call.
const char * llama_print_system_info(void) {
    static std::string s;
    s.clear();

    // appends one "LABEL = <value> | " entry
    const auto append_flag = [](const char * label, int value) {
        s += label + std::to_string(value) + " | ";
    };

    append_flag("AVX = ",         ggml_cpu_has_avx());
    append_flag("AVX_VNNI = ",    ggml_cpu_has_avx_vnni());
    append_flag("AVX2 = ",        ggml_cpu_has_avx2());
    append_flag("AVX512 = ",      ggml_cpu_has_avx512());
    append_flag("AVX512_VBMI = ", ggml_cpu_has_avx512_vbmi());
    append_flag("AVX512_VNNI = ", ggml_cpu_has_avx512_vnni());
    append_flag("AVX512_BF16 = ", ggml_cpu_has_avx512_bf16());
    append_flag("FMA = ",         ggml_cpu_has_fma());
    append_flag("NEON = ",        ggml_cpu_has_neon());
    append_flag("SVE = ",         ggml_cpu_has_sve());
    append_flag("ARM_FMA = ",     ggml_cpu_has_arm_fma());
    append_flag("F16C = ",        ggml_cpu_has_f16c());
    append_flag("FP16_VA = ",     ggml_cpu_has_fp16_va());
    append_flag("WASM_SIMD = ",   ggml_cpu_has_wasm_simd());
    append_flag("BLAS = ",        ggml_cpu_has_blas());
    append_flag("SSE3 = ",        ggml_cpu_has_sse3());
    append_flag("SSSE3 = ",       ggml_cpu_has_ssse3());
    append_flag("VSX = ",         ggml_cpu_has_vsx());
    append_flag("MATMUL_INT8 = ", ggml_cpu_has_matmul_int8());

    return s.c_str();
}
// Writes the timing counters of `ctx` to `stream` as a commented YAML section:
// per-token times, raw counts, raw microsecond totals, and tokens-per-second rates.
// The ratios are computed in floating point without guarding against zero counts.
void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
    const auto & sampling = ctx->sampling;

    // milliseconds per token
    const double ms_per_eval   = 1.0e-3 * ctx->t_eval_us / ctx->n_eval;
    const double ms_per_p_eval = 1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval;
    const double ms_per_sample = 1.0e-3 * sampling.t_sample_us / sampling.n_sample;

    // tokens per second
    const double eval_per_sec   = 1.0e6 * ctx->n_eval / ctx->t_eval_us;
    const double p_eval_per_sec = 1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us;
    const double sample_per_sec = 1.0e6 * sampling.n_sample / sampling.t_sample_us;

    fprintf(stream, "\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "# Timings #\n");
    fprintf(stream, "###########\n");
    fprintf(stream, "\n");

    fprintf(stream, "mst_eval: %.2f # ms / token during generation\n", ms_per_eval);
    fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n", ms_per_p_eval);
    fprintf(stream, "mst_sample: %.2f # ms / token during sampling\n", ms_per_sample);

    fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
    fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
    fprintf(stream, "n_sample: %d # number of sampled tokens\n", sampling.n_sample);

    fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
    fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
    fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
    fprintf(stream, "t_sample_us: %" PRId64 " # total microseconds spent sampling\n", sampling.t_sample_us);

    fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n", eval_per_sec);
    fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n", p_eval_per_sec);
    fprintf(stream, "ts_sample: %.2f # tokens / second during sampling\n", sample_per_sec);
}
// Returns a reference to the model's `tensors_by_name` list (name/tensor pairs) held
// by `ctx`. `ctx` is dereferenced unconditionally.
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
        struct llama_context * ctx
) {
    const auto & model = ctx->model;
    return model.tensors_by_name;
}
// Installs the log callback and its user data in the global state. A null callback
// selects llama_log_callback_default. When built with Metal, CUDA or CANN support, the
// same callback is also registered with that backend.
void llama_log_set(ggml_log_callback log_callback, void * user_data) {
    if (log_callback) {
        g_state.log_callback = log_callback;
    } else {
        g_state.log_callback = llama_log_callback_default;
    }
    g_state.log_callback_user_data = user_data;
#ifdef GGML_USE_METAL
    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#elif defined(GGML_USE_CUDA)
    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#elif defined(GGML_USE_CANN)
    ggml_backend_cann_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
#endif
}
// Formats a log message and passes it to the installed log callback.
//
//   level  - severity handed through to the callback
//   format - printf-style format string
//   args   - arguments for `format`; consumed by this call
//
// Messages shorter than 128 bytes are formatted on the stack; longer ones are formatted
// a second time into a heap buffer of exactly the required size. If formatting fails
// (vsnprintf returns a negative value) nothing is logged, because the buffer contents
// are unspecified in that case.
static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
    // a va_list can only be traversed once, so keep a copy for the second formatting pass
    va_list args_copy;
    va_copy(args_copy, args);

    char buffer[128];
    const int len = vsnprintf(buffer, sizeof(buffer), format, args);
    if (len >= 0) {
        if (len < (int) sizeof(buffer)) {
            g_state.log_callback(level, buffer, g_state.log_callback_user_data);
        } else {
            // the message was truncated: len is the length it needs, excluding the NUL
            std::vector<char> heap_buffer(static_cast<size_t>(len) + 1);
            vsnprintf(heap_buffer.data(), heap_buffer.size(), format, args_copy);
            heap_buffer[len] = 0;
            g_state.log_callback(level, heap_buffer.data(), g_state.log_callback_user_data);
        }
    }
    va_end(args_copy);
}
// Variadic front end for llama_log_internal_v: collects the arguments following
// `format` into a va_list and forwards them together with `level`.
void llama_log_internal(ggml_log_level level, const char * format, ...) {
    va_list ap;
    va_start(ap, format);
    llama_log_internal_v(level, format, ap);
    va_end(ap);
}
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}
// Logs the requested change and forwards it to ggml_backend_sched_set_op_offload on the
// context's scheduler. Does nothing when `lctx` is null or has no scheduler. An `op`
// outside [0, GGML_OP_COUNT) is reported as "all ops" in the log line and is still
// passed through as-is.
void llama_set_offload_policy(struct llama_context * lctx, int op, bool on_or_off) {
    if (!lctx || !lctx->sched) {
        return;
    }

    const bool op_in_range = op >= 0 && op < int(GGML_OP_COUNT);
    const char * op_name = op_in_range ? ggml_op_name(ggml_op(op)) : "all ops";

    LLAMA_LOG_INFO("XXXXXXXXXXXXXXXXXXXXXXXXXXXX offload(%s) = %d\n", op_name, on_or_off);
    ggml_backend_sched_set_op_offload(lctx->sched, ggml_op(op), on_or_off);
}
// Points the context at a caller-provided hidden-state buffer. The context's owned
// copy is cleared, the pointer is stored as given, and the float count is derived from
// the byte size of `inp_mtp_states` when that tensor is set (0 otherwise).
void llama_set_draft_input_hidden_state(struct llama_context * ctx, const float * hidden_state) {
    ctx->draft_input_hidden_state_owned.clear();
    ctx->draft_input_hidden_state = hidden_state;

    size_t n_floats = 0;
    if (ctx->inp_mtp_states) {
        n_floats = ggml_nbytes(ctx->inp_mtp_states) / sizeof(float);
    }
    ctx->draft_input_hidden_state_n_floats = n_floats;
}
// Stores `target_ctx` in `ctx->mtp_target_ctx`. `ctx` is dereferenced unconditionally.
void llama_set_mtp_target_context(
        struct llama_context * ctx,
        struct llama_context * target_ctx) {
    ctx->mtp_target_ctx = target_ctx;
}
// Type-erased wrapper around unicode_fill_from_utf8. The three opaque pointers are
// reinterpreted as std::string*, std::vector<uint32_t>* and std::vector<std::string>*
// respectively; callers must pass objects of exactly those types.
size_t llama_fill_from_utf8(void* utf8, void* cpts, void* scripts) {
    auto * text        = static_cast<std::string *>(utf8);
    auto * codepoints  = static_cast<std::vector<uint32_t> *>(cpts);
    auto * script_list = static_cast<std::vector<std::string> *>(scripts);
    return unicode_fill_from_utf8(text, codepoints, script_list);
}