#pragma once
#include "llama-impl.h"
#include "llama-cparams.h"
#include "llama-sampling.h"
#include "llama-spec-features.h"

#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <set>
#include <unordered_map>
#include <vector>

struct llama_model;
struct llama_kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
int32_t src = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const llama_kv_cell & other) const {
return seq_id == other.seq_id;
}
};
struct llama_kv_cache {
bool has_shift = false;
bool do_defrag = false;
bool do_copy = false;
bool recurrent = false; bool hybrid = false;
bool v_trans = true;
uint32_t head = 0;
uint32_t size = 0;
uint32_t used = 0;
uint32_t n = 0;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
std::vector<llama_kv_cell> cells;
std::vector<struct ggml_tensor *> k_l; std::vector<struct ggml_tensor *> v_l;
std::vector<struct ggml_tensor *> s_l;
bool save_per_step_ssm = false;
std::vector<llama_split_tensor> split_k_l;
std::vector<llama_split_tensor> split_v_l;
std::vector<llama_split_tensor> split_s_l;
std::vector<llama_split_tensor> replicated_k_l;
std::vector<struct ggml_context *> ctxs;
std::vector<ggml_backend_buffer_t> bufs;
size_t total_size() const {
size_t size = 0;
for (ggml_backend_buffer_t buf : bufs) {
size += ggml_backend_buffer_get_size(buf);
}
return size;
}
struct gpu_checkpoint {
std::vector<llama_kv_cell> cells_snapshot;
uint32_t head_snapshot = 0;
uint32_t used_snapshot = 0;
std::vector<ggml_tensor *> s_l_shadow;
std::vector<std::vector<ggml_tensor *>> split_s_l_shadow;
std::vector<std::vector<ggml_tensor *>> per_step_ssm;
std::vector<std::vector<ggml_tensor *>> per_step_conv;
int32_t per_step_n_tokens = 0;
int32_t per_step_max_allocated = 0;
int64_t per_step_ssm_state_size = 0;
int64_t per_step_conv_state_dim = 0;
int64_t per_step_conv_dim = 0;
int32_t per_step_d_conv = 0;
int selected_spec_mode = -1;
int fixed_spec_mode = LLAMA_SPEC_CKPT_NONE;
int32_t fixed_max_tokens = 0;
std::vector<uint8_t> cpu_state_data;
std::vector<struct ggml_context *> per_step_ctxs;
std::vector<ggml_backend_buffer_t> per_step_bufs;
std::vector<struct ggml_context *> shadow_ctxs;
std::vector<ggml_backend_buffer_t> shadow_bufs;
bool allocated = false;
bool shadow_conv_only = false;
bool saved = false;
~gpu_checkpoint() {
for (struct ggml_context * ctx : shadow_ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : shadow_bufs) {
ggml_backend_buffer_free(buf);
}
for (struct ggml_context * ctx : per_step_ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : per_step_bufs) {
ggml_backend_buffer_free(buf);
}
}
};
gpu_checkpoint ckpt;
bool checkpoint_alloc_shadows(bool conv_only_shadow = false);
bool checkpoint_supported() const;
bool checkpoint_save(ggml_backend_sched_t sched);
bool checkpoint_restore(ggml_backend_sched_t sched);
void checkpoint_delete();
bool per_step_alloc(const llama_model & model, int max_tokens);
bool per_step_restore(const llama_model & model, ggml_backend_sched_t sched, int step);
~llama_kv_cache() {
for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
ggml_backend_buffer_free(buf);
}
}
};
struct llama_control_vector {
std::vector<struct ggml_tensor *> tensors; std::vector<struct ggml_context *> ctxs;
std::vector<ggml_backend_buffer_t> bufs;
int32_t layer_start = -1;
int32_t layer_end = -1;
struct ggml_tensor * tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}
return tensors[il];
}
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
ggml_tensor * layer_dir = tensor_for(il);
if (layer_dir != nullptr) {
cur = ggml_add(ctx, cur, layer_dir);
}
return cur;
}
~llama_control_vector() {
for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
ggml_backend_buffer_free(buf);
}
}
};
// Per-context inference state: a reference to the model plus the parameters, sampling state,
// KV cache, backends, output buffers and graph-input tensors that belong to one context.
//
// Every scalar and raw-pointer member carries a default member initializer, so none of them
// holds an indeterminate value regardless of what the out-of-line constructor assigns.
struct llama_context {
    llama_context(const llama_model & model);
    ~llama_context();

    const struct llama_model & model;

    struct llama_cparams  cparams;
    struct llama_sampling sampling;
    struct llama_kv_cache kv_self;

    // non-owning as far as this header shows: only declared here, never freed in this file
    struct llama_context * mtp_target_ctx = nullptr;

    struct llama_control_vector cvec;

    std::vector<float> scale_data;

    // adapter -> float value (presumably the adapter scale — TODO confirm against the setter)
    std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;

    std::vector<ggml_backend_t> backends;
#ifdef GGML_USE_METAL
    ggml_backend_t backend_metal = nullptr;
#endif
#ifdef GGML_USE_BLAS
    ggml_backend_t backend_blas = nullptr;
#endif
    ggml_backend_t backend_cpu = nullptr;

    bool has_evaluated_once = false;

    // timing counters; the `_us` suffix suggests microseconds — TODO confirm
    int64_t t_start_us         = 0;
    int64_t t_load_us          = 0;
    int64_t t_p_eval_us        = 0;
    int64_t t_eval_us          = 0;
    int64_t t_compute_start_us = 0;

    int64_t n_queued_tokens = 0;
    int32_t n_p_eval        = 0;
    int32_t n_eval          = 0;

    // output storage
    ggml_backend_buffer_t buf_output = nullptr;

    size_t  logits_size = 0;
    float * logits      = nullptr;

    std::vector<int32_t> output_ids;
    size_t  output_size    = 0;
    int32_t n_outputs      = 0;
    int32_t n_outputs_embd = 0;

    bool logits_all = false;

    size_t  embd_size = 0;
    float * embd      = nullptr;

    // per-sequence float vectors keyed by sequence id
    std::map<llama_seq_id, std::vector<float>> embd_seq;

    bool is_encoding = false;

    std::vector<float>                  embd_enc;
    std::vector<std::set<llama_seq_id>> seq_ids_enc;

    std::vector<uint8_t> buf_compute_meta;
    ggml_backend_sched_t sched = nullptr;

    ggml_abort_callback abort_callback      = nullptr;
    void *              abort_callback_data = nullptr;

    // NOTE(review): the `_owned` vector presumably backs the raw pointer when the context
    // keeps its own copy — confirm against the setter.
    const float *      draft_input_hidden_state          = nullptr;
    size_t             draft_input_hidden_state_n_floats = 0;
    std::vector<float> draft_input_hidden_state_owned;

    // graph-input tensors; all default to nullptr so an unassigned input is detectable
    struct ggml_tensor * inp_tokens        = nullptr;
    struct ggml_tensor * inp_embd          = nullptr;
    struct ggml_tensor * inp_pos           = nullptr;
    struct ggml_tensor * inp_out_ids       = nullptr;
    struct ggml_tensor * inp_KQ_mask       = nullptr;
    struct ggml_tensor * inp_KQ_mask_swa   = nullptr;
    struct ggml_tensor * inp_K_shift       = nullptr;
    struct ggml_tensor * inp_mean          = nullptr;
    struct ggml_tensor * inp_cls           = nullptr;
    struct ggml_tensor * inp_s_copy        = nullptr;
    struct ggml_tensor * inp_s_mask        = nullptr;
    struct ggml_tensor * inp_s_seq         = nullptr;
    struct ggml_tensor * inp_s_seq_qnext   = nullptr;
    struct ggml_tensor * inp_pos_bucket    = nullptr;
    struct ggml_tensor * inp_embd_enc      = nullptr;
    struct ggml_tensor * inp_KQ_mask_cross = nullptr;
    struct ggml_tensor * inp_scale         = nullptr;
    struct ggml_tensor * inp_mtp_states    = nullptr;

    ggml_backend_t ggml_backend_by_name(const char * name);

    // `Prev` is only forward-declared here; its definition lives elsewhere.
    struct Prev;
    std::unique_ptr<Prev> prev;
    std::unique_ptr<Prev> prev_mtp;

    void reset_scheduler();
    bool can_reuse_graph(const llama_batch & u_batch);

    // A tensor pointer paired with a step value.
    struct CacheCopy {
        ggml_tensor * cpy  = nullptr;
        size_t        step = 0;
    };
    std::vector<CacheCopy> cache_copies;
    bool update_cache_copies();

    bool prepare_mtp_graph_inputs(
        struct llama_context & lctx);
    void set_mtp_op_type(llama_mtp_op_type value);

    int max_nodes(int n_tokens, int n_kv) const;
};