#include "models.h"
void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
}
if (target_layer_ids.size() != 3) {
throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
}
LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
target_layer_ids[0],
target_layer_ids[1],
target_layer_ids[2]);
uint32_t n_embd_tgt = 0;
ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
if (hparams.norm_before_residual) {
LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__);
}
type = LLM_TYPE_UNKNOWN;
}
void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
LLAMA_LOAD_LOCALS;
const int64_t n_embd_inp = hparams.n_embd_inp();
const int64_t n_embd_attn_input = 2 * n_embd;
int64_t n_draft_vocab = n_vocab; const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t");
if (d2t_meta) {
n_draft_vocab = d2t_meta->ne[0]; d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
} else {
d2t = nullptr; LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
}
fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);
const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
if (tok_embd_meta) {
const int64_t n_target_vocab = tok_embd_meta->ne[1];
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0);
LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab);
}
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED);
}
}
std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
switch (params.gtype) {
case LLM_GRAPH_TYPE_ENCODER:
return std::make_unique<graph<true>>(*this, params);
case LLM_GRAPH_TYPE_DEFAULT:
case LLM_GRAPH_TYPE_DECODER:
return std::make_unique<graph<false>>(*this, params);
default:
GGML_ABORT("invalid graph type");
};
}
template <>
ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
ggml_tensor * cur = nullptr;
auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
ggml_set_input(inp_target->embd);
cur = inp_target->embd;
cb(cur, "inp_embd", -1);
res->add_input(std::move(inp_target));
return cur;
}
template <>
llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
ggml_tensor * cur = nullptr;
cur = build_inp_embd_enc();
cur = build_lora_mm(model.fc, cur);
cb(cur, "fc_out", -1);
ggml_set_output(cur);
res->t_h_nextn = cur;
ggml_build_forward_expand(gf, cur);
}
template <>
llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
GGML_ASSERT(n_layer == 1);
ggml_tensor * cur;
ggml_tensor * inpL;
auto * tok_embd = model.tok_embd;
if (model.tok_embd == nullptr) {
GGML_ASSERT(cparams.ctx_other != nullptr);
const auto * model_other = llama_get_model(cparams.ctx_other);
GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
tok_embd = model_other->tok_embd;
}
auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
ggml_set_input(inp->tokens);
inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
ggml_set_input(inp->embd);
ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
cb(inp_embd, "inp_embd", -1);
ggml_tensor * inp_g = inp->embd;
cb(inp_g, "inp_g_embeddings", -1);
res->add_input(std::move(inp));
inpL = inp_g;
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv();
const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
const int il = 0;
{
ggml_tensor * embd_norm = build_norm(inp_embd,
model.layers[il].attn_norm, NULL,
LLM_NORM_RMS, il);
cb(embd_norm, "embd_norm", il);
ggml_tensor * g_norm = build_norm(inp_g,
model.layers[il].attn_norm_2, NULL,
LLM_NORM_RMS, -1);
cb(g_norm, "g_norm", il);
ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
cur = ggml_concat(ctx0, embd_norm, g_norm, il);
cb(cur, "concat_embd", il);
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL, nullptr,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
cur = build_norm(ffn_inp,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, il);
cb(cur, "post_attn_norm", il);
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "eagle3_prenorm", il);
inpL = cur;
}
cur = inpL;
ggml_set_output(cur);
res->t_h_nextn = cur;
cur = build_norm(cur,
model.output_norm, NULL,
LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
auto * output = model.output;
if (output == nullptr) {
GGML_ASSERT(cparams.ctx_other != nullptr);
const auto * model_other = llama_get_model(cparams.ctx_other);
GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
output = model_other->output;
}
cur = build_lora_mm(output, cur);
if (model.d2t) {
const int64_t n_draft_vocab = cur->ne[0];
const int64_t n_outputs = cur->ne[1];
const int64_t n_vocab = (int64_t) model.vocab.n_tokens();
GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
cur = ggml_set_rows(ctx0, logits,
ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs),
ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1));
cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
}
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}