#pragma once
#include "llama-impl.h"
#include "llama-arch.h"
#include "llama-mmap.h"
#include "llama-vocab.h"
#include "llama-hparams.h"
#include "ggml-backend.h"
#include <vector>
#include <unordered_map>
#include <set>
// Model "type" tags. The enumerators are plain sequential values starting at 0
// (MODEL_UNKNOWN); nothing in this header assigns explicit numbers, so the
// declaration order below IS the numeric value and must not be changed.
//
// NOTE(review): the names look like approximate parameter counts (M/B suffixes,
// AxB / _A* forms for expert-style models) - that reading comes from the names
// only; the mapping from real models to tags lives outside this header.
enum e_model {
    MODEL_UNKNOWN,

    // "<n>M" tags
    MODEL_14M,  MODEL_17M,  MODEL_22M,  MODEL_33M,  MODEL_60M,  MODEL_70M,  MODEL_80M,
    MODEL_109M, MODEL_137M, MODEL_140M, MODEL_160M, MODEL_190M,
    MODEL_220M, MODEL_250M, MODEL_256M, MODEL_270M,
    MODEL_335M, MODEL_350M, MODEL_360M,
    MODEL_410M, MODEL_450M, MODEL_475M,
    MODEL_558M, MODEL_700M, MODEL_770M, MODEL_780M, MODEL_950M,

    // "<n>B" tags below 10B
    MODEL_0_3B, MODEL_0_5B, MODEL_0_6B, MODEL_0_8B,
    MODEL_1B,   MODEL_1_2B, MODEL_1_3B, MODEL_1_4B, MODEL_1_5B, MODEL_1_6B, MODEL_1_7B, MODEL_1_8B,
    MODEL_2B,   MODEL_2_6B, MODEL_2_8B, MODEL_2_9B,
    MODEL_3B,   MODEL_4B,   MODEL_6B,   MODEL_6_9B,
    MODEL_7B,   MODEL_8B,   MODEL_9B,

    // "<n>B" tags from 11B upwards
    MODEL_11B,  MODEL_12B,  MODEL_13B,  MODEL_14B,  MODEL_15B,  MODEL_16B,
    MODEL_20B,  MODEL_27B,
    MODEL_30B,  MODEL_32B,  MODEL_34B,  MODEL_35B,  MODEL_36B,
    MODEL_40B,  MODEL_65B,  MODEL_70B,
    MODEL_120B, MODEL_142B, MODEL_236B, MODEL_290B, MODEL_314B, MODEL_405B, MODEL_671B,

    // named size classes
    MODEL_SMALL, MODEL_MEDIUM, MODEL_LARGE, MODEL_XL,

    // composite tags
    MODEL_A1_7B, MODEL_A2_7B,
    MODEL_8x7B,  MODEL_8x22B, MODEL_16x12B, MODEL_16x3_8B,
    MODEL_10B_128x3_66B,
    MODEL_57B_A14B,
    MODEL_17B_16E, MODEL_17B_128E,
    MODEL_A13B,
    MODEL_7B_A1B,    MODEL_8B_A1B,    MODEL_12B_A2_5B, MODEL_16B_A1B,
    MODEL_21B_A3B,   MODEL_30B_A3B,   MODEL_33B_A3B,   MODEL_35B_A3B,
    MODEL_80B_A3B,   MODEL_80B_A13B,
    MODEL_100B_A6B,  MODEL_106B_A12B, MODEL_119B_A6B,  MODEL_122B_A10B,
    MODEL_230B_A10B, MODEL_235B_A22B,
    MODEL_310B_A15B, // declared before MODEL_300B_A47B; order kept to preserve values
    MODEL_300B_A47B, MODEL_355B_A32B,
    MODEL_397B_A17B, MODEL_744B_A40B,

    MODEL_E2B,
    MODEL_E4B,
};
// Bundle of six optional tensor handles embedded in llama_layer as `nextn`.
// Each handle defaults to nullptr; this struct has no destructor, so it does
// not release whatever the pointers refer to.
// NOTE(review): the role of each tensor is not visible in this header - the
// names are the only hint.
struct llama_layer_nextn {
    struct ggml_tensor * eh_proj          = nullptr;
    struct ggml_tensor * embed_tokens     = nullptr;

    struct ggml_tensor * enorm            = nullptr;
    struct ggml_tensor * hnorm            = nullptr;

    struct ggml_tensor * shared_head_head = nullptr;
    struct ggml_tensor * shared_head_norm = nullptr;
};
// Per-layer collection of tensor handles plus their llama_split_tensor
// companions.
//
// Every raw ggml_tensor pointer defaults to nullptr, so "tensor not present"
// can be checked the same way for all of them (this includes the per_layer_*
// handles). The struct declares no destructor: the raw pointers are not
// released here. Only the std::unique_ptr members at the end own their
// pointees.
//
// Member order is kept stable on purpose; it determines construction and
// destruction order.
//
// NOTE(review): the section headings below are derived from the member names
// only. The code that loads these tensors and gives them meaning is not part of
// this header - confirm against the loader / graph builder before relying on a
// heading.
struct llama_layer {
    // --- *_norm handles ---
    struct ggml_tensor * attn_norm       = nullptr;
    struct ggml_tensor * attn_norm_b     = nullptr;
    struct ggml_tensor * attn_norm_2     = nullptr;
    struct ggml_tensor * attn_norm_2_b   = nullptr;
    struct ggml_tensor * attn_q_norm     = nullptr;
    struct ggml_tensor * attn_q_norm_b   = nullptr;
    struct ggml_tensor * attn_k_norm     = nullptr;
    struct ggml_tensor * attn_k_norm_b   = nullptr;
    struct ggml_tensor * attn_out_norm   = nullptr;
    struct ggml_tensor * attn_out_norm_b = nullptr;
    struct ggml_tensor * attn_q_a_norm   = nullptr;
    struct ggml_tensor * attn_kv_a_norm  = nullptr;
    struct ggml_tensor * attn_sub_norm   = nullptr;
    struct ggml_tensor * attn_post_norm  = nullptr;
    struct ggml_tensor * ffn_sub_norm    = nullptr;
    struct ggml_tensor * attn_norm_cross = nullptr;
    struct ggml_tensor * attn_norm_enc   = nullptr;
    struct ggml_tensor * wqkv_gate       = nullptr;
    struct ggml_tensor * ffn_post_norm_1 = nullptr;
    struct ggml_tensor * ffn_post_norm_2 = nullptr;
    struct ggml_tensor * ffn_pre_norm_2  = nullptr;
    struct ggml_tensor * ffn_gate_inp_s  = nullptr;

    // --- w* handles ---
    struct ggml_tensor * wq         = nullptr;
    struct ggml_tensor * wk         = nullptr;
    struct ggml_tensor * wv         = nullptr;
    struct ggml_tensor * wo         = nullptr;
    struct ggml_tensor * wqkv       = nullptr;
    struct ggml_tensor * wqk        = nullptr;
    struct ggml_tensor * wkv        = nullptr;
    struct ggml_tensor * wq_a       = nullptr;
    struct ggml_tensor * wq_b       = nullptr;
    struct ggml_tensor * wkv_a_mqa  = nullptr;
    struct ggml_tensor * wkq_a_mqa  = nullptr;
    struct ggml_tensor * wkv_b      = nullptr;
    struct ggml_tensor * wk_b       = nullptr;
    struct ggml_tensor * wk_b_pp    = nullptr;
    struct ggml_tensor * wv_b       = nullptr;
    struct ggml_tensor * wq_cross   = nullptr;
    struct ggml_tensor * wk_cross   = nullptr;
    struct ggml_tensor * wv_cross   = nullptr;
    struct ggml_tensor * wo_cross   = nullptr;
    struct ggml_tensor * wq_enc     = nullptr;
    struct ggml_tensor * wk_enc     = nullptr;
    struct ggml_tensor * wv_enc     = nullptr;
    struct ggml_tensor * wo_enc     = nullptr;
    struct ggml_tensor * attn_sinks = nullptr;

    // --- b* handles ---
    struct ggml_tensor * bq   = nullptr;
    struct ggml_tensor * bk   = nullptr;
    struct ggml_tensor * bv   = nullptr;
    struct ggml_tensor * bo   = nullptr;
    struct ggml_tensor * bqkv = nullptr;
    struct ggml_tensor * bqk  = nullptr;
    struct ggml_tensor * bkv  = nullptr;

    // --- split_* companions (attention / ssm names) ---
    llama_split_tensor split_attn_norm;
    llama_split_tensor split_attn_post_norm;
    llama_split_tensor split_attn_sinks;
    llama_split_tensor split_wq;
    llama_split_tensor split_wk;
    llama_split_tensor split_wv;
    llama_split_tensor split_wo;
    llama_split_tensor split_wqkv;
    llama_split_tensor split_wqk;
    llama_split_tensor split_wkv;
    llama_split_tensor split_bq;
    llama_split_tensor split_bk;
    llama_split_tensor split_bv;
    llama_split_tensor split_bo;
    llama_split_tensor split_bqkv;
    llama_split_tensor split_bqk;
    llama_split_tensor split_bkv;
    llama_split_tensor split_q_norm;
    llama_split_tensor split_k_norm;
    llama_split_tensor split_sinks;
    llama_split_tensor split_wqkv_gate;
    llama_split_tensor split_wq_a;
    llama_split_tensor split_wq_b;
    llama_split_tensor split_wkv_a_mqa;
    llama_split_tensor split_wk_b;
    llama_split_tensor split_wk_b_pp;
    llama_split_tensor split_wv_b;
    llama_split_tensor split_attn_q_a_norm;
    llama_split_tensor split_attn_kv_a_norm;
    llama_split_tensor split_ssm_wqkv;
    llama_split_tensor split_ssm_wqkv_gate;
    llama_split_tensor split_ssm_in;
    llama_split_tensor split_ssm_conv1d;
    llama_split_tensor split_ssm_dt;
    llama_split_tensor split_ssm_a;
    llama_split_tensor split_ssm_beta_alpha;
    llama_split_tensor split_ssm_beta;
    llama_split_tensor split_ssm_alpha;
    llama_split_tensor split_ssm_norm;
    llama_split_tensor split_ssm_out;

    // --- attn_rel_b* handles ---
    struct ggml_tensor * attn_rel_b       = nullptr;
    struct ggml_tensor * attn_rel_b_enc   = nullptr;
    struct ggml_tensor * attn_rel_b_cross = nullptr;

    // --- ffn / layer-output norm handles ---
    struct ggml_tensor * ffn_norm         = nullptr;
    struct ggml_tensor * ffn_norm_b       = nullptr;
    struct ggml_tensor * ffn_post_norm    = nullptr;
    struct ggml_tensor * layer_out_norm   = nullptr;
    struct ggml_tensor * layer_out_norm_b = nullptr;
    struct ggml_tensor * ffn_norm_exps    = nullptr;
    struct ggml_tensor * ffn_norm_enc     = nullptr;

    // --- ffn gate/down/up handles ---
    struct ggml_tensor * ffn_gate     = nullptr;
    struct ggml_tensor * ffn_down     = nullptr;
    struct ggml_tensor * ffn_up       = nullptr;
    struct ggml_tensor * ffn_gate_enc = nullptr;
    struct ggml_tensor * ffn_down_enc = nullptr;
    struct ggml_tensor * ffn_up_enc   = nullptr;

    llama_split_tensor split_ffn_up;
    llama_split_tensor split_ffn_gate;
    llama_split_tensor split_ffn_down;
    llama_split_tensor split_ffn_norm;
    llama_split_tensor split_ffn_up_gate;
    llama_split_tensor split_ffn_post_norm;
    llama_split_tensor split_ffn_post_norm_1;
    llama_split_tensor split_ffn_post_norm_2;
    llama_split_tensor split_ffn_pre_norm_2;

    // --- ffn *_exps handles ---
    struct ggml_tensor * ffn_gate_inp     = nullptr;
    struct ggml_tensor * ffn_gate_exps    = nullptr;
    struct ggml_tensor * ffn_down_exps    = nullptr;
    struct ggml_tensor * ffn_up_exps      = nullptr;
    struct ggml_tensor * ffn_up_gate_exps = nullptr;

    llama_split_tensor split_ffn_gate_inp;
    llama_split_tensor split_ffn_up_exps;
    llama_split_tensor split_ffn_gate_exps;
    llama_split_tensor split_ffn_down_exps;
    llama_split_tensor split_ffn_up_gate_exps;

    // --- ffn *_exps_b / *_shexp handles ---
    struct ggml_tensor * ffn_gate_inp_b      = nullptr;
    struct ggml_tensor * ffn_gate_exps_b     = nullptr;
    struct ggml_tensor * ffn_down_exps_b     = nullptr;
    struct ggml_tensor * ffn_up_exps_b       = nullptr;
    struct ggml_tensor * ffn_up_gate_exps_b  = nullptr;
    struct ggml_tensor * ffn_gate_exps_b_dup = nullptr;
    struct ggml_tensor * ffn_down_exps_b_dup = nullptr;
    struct ggml_tensor * ffn_up_exps_b_dup   = nullptr;
    struct ggml_tensor * ffn_down_exps_s     = nullptr;
    struct ggml_tensor * ffn_gate_inp_shexp  = nullptr;
    struct ggml_tensor * ffn_gate_shexp      = nullptr;
    struct ggml_tensor * ffn_down_shexp      = nullptr;
    struct ggml_tensor * ffn_up_shexp        = nullptr;

    llama_split_tensor split_ffn_up_shexp;
    llama_split_tensor split_ffn_gate_shexp;
    llama_split_tensor split_ffn_down_shexp;
    llama_split_tensor split_ffn_gate_inp_shexp;
    llama_split_tensor split_ffn_gate_inp_b;
    llama_split_tensor split_ffn_gate_exps_b;
    llama_split_tensor split_ffn_down_exps_b;
    llama_split_tensor split_ffn_up_exps_b;
    llama_split_tensor split_ffn_up_gate_exps_b;
    llama_split_tensor split_ffn_down_exps_s;
    llama_split_tensor split_ffn_gate_inp_s;

    // --- ffn *_b handles ---
    struct ggml_tensor * ffn_gate_b      = nullptr;
    struct ggml_tensor * ffn_down_b      = nullptr;
    struct ggml_tensor * ffn_up_b        = nullptr;
    struct ggml_tensor * ffn_act         = nullptr;
    struct ggml_tensor * ffn_exp_probs_b = nullptr;

    llama_split_tensor split_ffn_gate_b;
    llama_split_tensor split_ffn_down_b;
    llama_split_tensor split_ffn_up_b;
    llama_split_tensor split_ffn_act;
    llama_split_tensor split_ffn_exp_probs_b;

    // --- per_layer_* handles ---
    // Explicitly nullptr like every other handle in this struct; without the
    // initializer a default-constructed layer would hold indeterminate pointers
    // here and null checks on them would be meaningless.
    struct ggml_tensor * per_layer_inp_gate  = nullptr;
    struct ggml_tensor * per_layer_proj      = nullptr;
    struct ggml_tensor * per_layer_post_norm = nullptr;

    // --- ssm_* handles ---
    struct ggml_tensor * ssm_in         = nullptr;
    struct ggml_tensor * ssm_x          = nullptr;
    struct ggml_tensor * ssm_dt         = nullptr;
    struct ggml_tensor * ssm_out        = nullptr;
    struct ggml_tensor * ssm_norm       = nullptr;
    struct ggml_tensor * ssm_beta_alpha = nullptr;
    struct ggml_tensor * ssm_alpha      = nullptr;
    struct ggml_tensor * ssm_beta       = nullptr;
    struct ggml_tensor * ssm_conv1d     = nullptr;
    struct ggml_tensor * ssm_a          = nullptr;
    struct ggml_tensor * ssm_d          = nullptr;
    struct ggml_tensor * ssm_conv1d_b   = nullptr;
    struct ggml_tensor * ssm_dt_b       = nullptr;

    // --- indexer_* handles ---
    struct ggml_tensor * indexer_k_norm   = nullptr;
    struct ggml_tensor * indexer_k_norm_b = nullptr;
    struct ggml_tensor * indexer_proj     = nullptr;
    struct ggml_tensor * indexer_attn_k   = nullptr;
    struct ggml_tensor * indexer_attn_q_b = nullptr;

    // --- rope_* handles ---
    struct ggml_tensor * rope_long  = nullptr;
    struct ggml_tensor * rope_short = nullptr;
    struct ggml_tensor * rope_freqs = nullptr;

    llama_split_tensor split_rope_freqs;

    // --- *_scale handles ---
    struct ggml_tensor * wq_scale       = nullptr;
    struct ggml_tensor * wk_scale       = nullptr;
    struct ggml_tensor * wv_scale       = nullptr;
    struct ggml_tensor * wo_scale       = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale   = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;
    struct ggml_tensor * out_scale      = nullptr;

    llama_split_tensor split_out_scale;

    struct llama_layer_nextn nextn;

    // Owned tensors: std::unique_ptr with the default deleter, so these are
    // destroyed together with the layer (and make llama_layer move-only).
    std::unique_ptr<ggml_tensor> computed_wk_b;
    std::unique_ptr<ggml_tensor> computed_wk_b_pp;
    std::unique_ptr<ggml_tensor> computed_wv_b;
    std::unique_ptr<ggml_tensor> computed_wkv_b;
    std::vector<std::unique_ptr<ggml_tensor>> computed_wk_b_replicas;
    std::vector<std::unique_ptr<ggml_tensor>> computed_wk_b_pp_replicas;
    std::vector<std::unique_ptr<ggml_tensor>> computed_wv_b_replicas;
};
struct llama_lora_adapter;
// One entry of llama_model::rpc_servers: an endpoint string plus a device number.
// NOTE(review): the endpoint format and the meaning of `device` are defined by
// the code that fills this in, which is not part of this header.
struct rpc_device {
    std::string endpoint;
    // Defaulted so a default-constructed rpc_device never carries an
    // indeterminate value; brace-initialisation `rpc_device{ep, dev}` still works.
    uint32_t    device = 0;
};
struct llama_cparams;
// Model-wide state: type/arch tags, hyper-parameters, vocabulary, global and
// per-layer tensor handles, device / buffer bookkeeping and the set of LoRA
// adapters registered against this model.
//
// All pointer and scalar members carry in-class defaults, so a
// default-constructed model has no indeterminate fields (has_tensor_overrides()
// and max_nodes() read some of them). Raw ggml_tensor pointers are plain
// handles; what the out-of-line destructor releases is not visible in this
// header.
//
// NOTE(review): short comments on tensor groups are based on member names only.
struct llama_model {
    e_model     type  = MODEL_UNKNOWN;
    llm_arch    arch  = LLM_ARCH_UNKNOWN;
    llama_ftype ftype = LLAMA_FTYPE_ALL_F32;

    std::string name = "n/a";

    llama_hparams hparams = {};
    llama_vocab   vocab;

    // input-side tensor handles
    struct ggml_tensor * tok_embd   = nullptr;
    struct ggml_tensor * type_embd  = nullptr;
    struct ggml_tensor * pos_embd   = nullptr;
    struct ggml_tensor * tok_norm   = nullptr;
    struct ggml_tensor * tok_norm_b = nullptr;

    struct ggml_tensor * tok_embd_per_layer   = nullptr;
    struct ggml_tensor * per_layer_model_proj = nullptr;
    struct ggml_tensor * per_layer_proj_norm  = nullptr;

    // mtp_* tensor handles
    struct ggml_tensor * mtp_pre_proj       = nullptr;
    struct ggml_tensor * mtp_post_proj      = nullptr;
    struct ggml_tensor * mtp_token_ordering = nullptr;
    struct ggml_tensor * mtp_centroids      = nullptr;

    // output-side tensor handles
    struct ggml_tensor * output_norm     = nullptr;
    struct ggml_tensor * output_norm_b   = nullptr;
    struct ggml_tensor * output          = nullptr;
    struct ggml_tensor * output_b        = nullptr;
    struct ggml_tensor * output_norm_enc = nullptr;
    struct ggml_tensor * output_mtp      = nullptr;

    // Owning pointer (default deleter).
    // NOTE(review): presumably backs output_mtp when that tensor is created
    // rather than loaded - confirm at the point where it is assigned.
    std::unique_ptr<ggml_tensor> output_mtp_ptr;

    llama_split_tensor split_output;
    llama_split_tensor split_output_norm;

    std::vector<llama_layer> layers;

    // Zero-initialised (enumerator value 0) so that max_nodes() never reads an
    // indeterminate value.
    // NOTE(review): the effective value is presumably copied from
    // llama_model_params by the loader - confirm.
    llama_split_mode split_mode = {};
    int  main_gpu     = 0;
    int  max_gpu      = 0;
    int  n_gpu_layers = 0;
    bool mtp          = false;

    std::vector<rpc_device> rpc_servers;
    std::vector<int32_t>    devices;
    std::vector<int32_t>    default_layer_device;
    std::vector<float>      aux_buffer;

    std::unordered_map<std::string, std::string> gguf_kv;

    // Pair of buffer types: one named for matrices, one for everything else.
    // The single-argument constructor uses the same buffer type for both and is
    // intentionally left non-explicit so existing implicit conversions from
    // ggml_backend_buffer_type_t keep compiling.
    struct layer_buft {
        layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
        layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
        layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}

        ggml_backend_buffer_type_t buft_matrix;
        ggml_backend_buffer_type_t buft;
    };

    layer_buft buft_input;
    layer_buft buft_output;
    std::vector<layer_buft> buft_layer;

    std::vector<struct ggml_context *> ctxs;
    std::vector<ggml_backend_buffer_t> bufs;

    llama_mmaps  mappings;
    llama_mlocks mlock_bufs;
    llama_mlocks mlock_mmaps;

    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

    // Adapters insert themselves here in their constructor and remove
    // themselves in their destructor (see llama_lora_adapter).
    std::set<llama_lora_adapter *> lora_adapters;

    bool tensor_overrides    = false;
    bool khad_pretransformed = false;

    // Defined out of line.
    ~llama_model();

    // Returns a graph node budget for a batch of `n_tokens` tokens.
    //
    // For LLM_ARCH_QWEN3NEXT / LLM_ARCH_QWEN35MOE / LLM_ARCH_QWEN35 the result is
    // max(40 * n_tokens, 32 * n_tensors), where n_tensors is
    // tensors_by_name.size(), multiplied by devices.size() when split_mode is
    // LLAMA_SPLIT_MODE_GRAPH and `devices` is non-empty. Every other
    // architecture gets the fixed value 65536.
    size_t max_nodes(int n_tokens) const {
        auto n_tensors = tensors_by_name.size();
        if (split_mode == LLAMA_SPLIT_MODE_GRAPH && !devices.empty()) {
            n_tensors *= devices.size();
        }
        if (arch == LLM_ARCH_QWEN3NEXT || arch == LLM_ARCH_QWEN35MOE || arch == LLM_ARCH_QWEN35) {
            // Widen before multiplying: `n_tokens * 40` as int could overflow
            // (undefined behaviour) for very large batches.
            return std::max<size_t>(static_cast<size_t>(n_tokens) * 40, 32u * n_tensors);
        }
        return 65536;
    }

    // Returns the tensor_overrides flag (false until something sets it).
    bool has_tensor_overrides() const {
        return tensor_overrides;
    }

    // True for the architectures this header treats as MLA models:
    // DEEPSEEK2, GLM_DSA and MISTRAL4.
    bool is_mla_model() const {
        return arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA || arch == LLM_ARCH_MISTRAL4;
    }

    // Picks a Hadamard block size for a given head size:
    //   - head_size itself when it has at most one bit set (a power of two; 0
    //     also satisfies the test and yields 0),
    //   - otherwise the largest of 512, 256, 128, 64 that divides head_size,
    //   - otherwise 0.
    static inline int hadamard_size(int head_size) {
        // x & ~(x - 1) isolates the lowest set bit of x.
        if ((head_size & ~(head_size - 1)) == head_size) return head_size;
        for (int i = 512; i >= 64; i >>= 1) {
            if (head_size % i == 0) return i;
        }
        return 0;
    }

    // Hadamard size for layer `il`, K side: fixed 64 for MLA models, otherwise
    // derived from hparams.n_embd_head_k(il).
    inline int hadamard_size_k(int il) const {
        if (is_mla_model()) return 64;
        return hadamard_size(hparams.n_embd_head_k(il));
    }

    // Hadamard size for layer `il`, V side: fixed 64 for MLA models, otherwise
    // derived from hparams.n_embd_head_v(il).
    inline int hadamard_size_v(int il) const {
        if (is_mla_model()) return 64;
        return hadamard_size(hparams.n_embd_head_v(il));
    }

    // The following members are declared here and defined out of line.
    size_t cache_size(int il, ggml_type type_k, ggml_type type_v, uint32_t kv_size, int mla_attn, int n_seq_max, bool flash_attn) const;

    void set_tensor_overrides(const llama_model_params& params);

    int device_count() const;
    ggml_backend_buffer_type_t default_buffer_type_offload(int device) const;

    std::vector<float> splits;
    ggml_backend_buffer_type_t split_buft = nullptr;
};
// A pair of tensor handles, `a` and `b`, stored per tensor name in
// llama_lora_adapter::ab_map. Both default to nullptr; the struct does not
// release what they point to.
struct llama_lora_weight {
    struct ggml_tensor * a = nullptr;
    struct ggml_tensor * b = nullptr;

    // Leaves both handles null.
    llama_lora_weight() = default;

    // Stores the two handles as given.
    llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b)
        : a(a)
        , b(b) {
    }
};
struct llama_lora_adapter {
llama_model * base_model;
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
std::vector<struct ggml_context *> ctxs;
std::vector<ggml_backend_buffer_t> bufs;
float alpha;
llama_lora_adapter(struct llama_model * base_model): base_model(base_model) {
base_model->lora_adapters.insert(this);
}
llama_lora_weight * get_weight(struct ggml_tensor * w) {
std::string name(w->name);
auto pos = ab_map.find(name);
if (ab_map.find(name) != ab_map.end()) {
return &pos->second;
}
return nullptr;
}
~llama_lora_adapter() {
for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
ggml_backend_buffer_free(buf);
}
auto pos = base_model->lora_adapters.find(this);
if (pos != base_model->lora_adapters.end()) {
base_model->lora_adapters.erase(pos);
}
}
};
// Callable helper bound to one architecture. Each operator() overload takes a
// tensor id, optionally with a suffix and/or integer indices, and returns a
// std::string; the overloads are only declared here and defined elsewhere.
// NOTE(review): `bid` / `xid` look like block and sub-index numbers - confirm
// against the definitions.
struct LLM_TN {
    llm_arch arch;

    // Left non-explicit so existing implicit conversions from llm_arch keep
    // compiling.
    LLM_TN(llm_arch arch) : arch(arch) {}

    // tensor only
    std::string operator()(llm_tensor tensor) const;

    // tensor + suffix
    std::string operator()(llm_tensor tensor, const std::string & suffix) const;

    // tensor + bid
    std::string operator()(llm_tensor tensor, int bid) const;

    // tensor + suffix + bid
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const;

    // tensor + suffix + bid + xid
    std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const;
};
// Returns a std::string for the given file type. Declared here, defined
// elsewhere; presumably a human-readable name - confirm against the definition.
std::string llama_model_ftype_name(llama_ftype ftype);
// Returns a C string for the given e_model tag. Defined elsewhere; check there
// for the lifetime of the returned pointer and the result for unlisted tags.
const char * llama_model_type_name(e_model type);