#pragma once
#include "llama.h"
#include "sampling.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
#include <cmath>
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
#define DIRECTORY_SEPARATOR '/'
#endif
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do {                                                                    \
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);          \
    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);   \
} while(0)
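// build info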
extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT;
extern char const *LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
int32_t get_num_physical_cores();
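//
// CLI argument parsing
//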
struct gpt_params {
    uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed

    int32_t n_threads             = get_num_physical_cores();
    int32_t n_threads_draft       = -1;
    int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads)
    int32_t n_threads_batch_draft = -1;
    int32_t n_predict             = -1;    // new tokens to predict
    int32_t n_ctx                 = 512;   // context size
    int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                = 0;     // number of tokens to keep from initial prompt
    int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding
    int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel            = 1;     // number of parallel sequences to decode
    int32_t n_sequences           = 1;     // number of sequences to decode
    float   p_split               = 0.1f;  // speculative decoding split probability
    int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 = use default)
    int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 = use default)
    llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors
    float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs
    int32_t n_beams               = 0;     // if non-zero then use beam search of given width
    int32_t grp_attn_n            = 1;     // group-attention factor
    int32_t grp_attn_w            = 512;   // group-attention width
    int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled)
    float   rope_freq_base        = 0.0f;  // RoPE base frequency
    float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor
    float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor
    float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor
    float   yarn_beta_fast        = 32.0f; // YaRN low correction dim
    float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim
    int32_t yarn_orig_ctx         = 0;     // YaRN original context size
    float   defrag_thold          = -1.0f; // KV cache defragmentation threshold
    ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings

    // sampling parameters
    struct llama_sampling_params sparams;
    std::string model                = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft          = "";        // draft model for speculative decoding
    std::string model_alias          = "unknown"; // model alias
    std::string model_url            = "";        // model url to download
    std::string hf_repo              = "";        // HF repo
    std::string hf_file              = "";        // HF file
    std::string prompt               = "";
    std::string prompt_file          = "";        // store the external prompt file name
    std::string path_prompt_cache    = "";        // path to file for saving/loading prompt eval state
    std::string input_prefix         = "";        // string to prefix user inputs with
    std::string input_suffix         = "";        // string to suffix user inputs with
    std::vector<std::string> antiprompt;          // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::string logdir               = "";        // directory in which to save YAML log files
    std::string lookup_cache_static  = "";        // path of static ngram cache file for lookup decoding
    std::string lookup_cache_dynamic = "";        // path of dynamic ngram cache file for lookup decoding
    std::string logits_file          = "";        // file for saving *all* logits
    std::vector<llama_model_kv_override> kv_overrides;

    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base = "";                               // base model path for the lora adapter

    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

    int32_t control_vector_layer_start = -1; // layer range for control vector
    int32_t control_vector_layer_end   = -1; // layer range for control vector
    int  ppl_stride      = 0;       // stride for perplexity calculations; if 0, the pre-existing approach is used
    int  ppl_output_type = 0;       // 0 -> ppl output is as usual; 1 -> ppl output is num_tokens, ppl per line

    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score; if 0, all tasks are computed

    bool   multiple_choice       = false; // compute multiple-choice score over random tasks from datafile supplied in prompt
    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the multiple-choice score; if 0, all tasks are computed

    bool kl_divergence = false; // compute KL divergence
    bool random_prompt     = false; // randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

    bool embedding         = false; // get only sentence embedding
    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
    bool ignore_eos        = false; // ignore generated EOS tokens
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool logits_all        = false; // return logits for all tokens in the batch
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool display_prompt    = true;  // print prompt before generation
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
    bool no_kv_offload     = false; // disable KV offloading

    std::string cache_type_k = "f16"; // KV cache data type for K
    std::string cache_type_v = "f16"; // KV cache data type for V

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
    std::string image  = ""; // path to an image file
};
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
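// Usage sketch (hypothetical caller; assumes only the declarations above):
//
//     int main(int argc, char ** argv) {
//         gpt_params params;
//         if (!gpt_params_parse(argc, argv, params)) {
//             return 1; // invalid arguments
//         }
//         // ... use params ...
//     }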
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string & input);
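//
// String parsing
//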
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
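//
// Model utils
//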
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
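// Usage sketch (assuming a populated gpt_params `params`):
//
//     llama_model *   model = nullptr;
//     llama_context * ctx   = nullptr;
//     std::tie(model, ctx)  = llama_init_from_gpt_params(params);
//     if (model == nullptr || ctx == nullptr) { /* initialization failed */ }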
struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params);
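// Batch utils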
void llama_batch_clear(struct llama_batch & batch);
void llama_batch_add(
                 struct llama_batch & batch,
                        llama_token   id,
                          llama_pos   pos,
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits);
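// Example (sketch; `tokens` stands in for a tokenized prompt):
//
//     llama_batch batch = llama_batch_init(512, 0, 1);
//     llama_batch_clear(batch);
//     for (size_t i = 0; i < tokens.size(); i++) {
//         // sequence 0; request logits only for the last token
//         llama_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, i == tokens.size() - 1);
//     }

//
// Vocab utils
//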
// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
std::vector<llama_token> llama_tokenize(
  const struct llama_context * ctx,
           const std::string & text,
                          bool add_bos,
                          bool special = false);

std::vector<llama_token> llama_tokenize(
    const struct llama_model * model,
           const std::string & text,
                          bool add_bos,
                          bool special = false);
// tokenizes a token into a piece
// should work similar to Python's `tokenizer.id_to_piece`
std::string llama_token_to_piece(
        const struct llama_context * ctx,
                       llama_token   token);
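// Example (sketch; assumes an initialized `ctx`):
//
//     const std::vector<llama_token> toks = llama_tokenize(ctx, "Hello world", true);
//     for (const llama_token t : toks) {
//         printf("%6d -> '%s'\n", t, llama_token_to_piece(ctx, t).c_str());
//     }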
// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// removes the leading space from the first non-BOS token
std::string llama_detokenize_spm(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
std::string llama_detokenize_bpe(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);
// Uses the value from the model metadata if possible, otherwise
// defaults to true when the model's vocab type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
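//
// YAML utils
//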
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
    FILE * stream, const gpt_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
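//
// KV cache utils
//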
// Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
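//
// Embedding utils
//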
void llama_embd_normalize(const float * inp, float * out, int n);
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
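// Example (sketch; `raw_a`/`raw_b` are hypothetical raw embedding buffers of size n_embd):
//
//     std::vector<float> a(n_embd), b(n_embd);
//     llama_embd_normalize(raw_a.data(), a.data(), n_embd);
//     llama_embd_normalize(raw_b.data(), b.data(), n_embd);
//     const float sim = llama_embd_similarity_cos(a.data(), b.data(), n_embd); // in [-1, 1]

//
// Control vector utils
//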
struct llama_control_vector_data {
    int n_embd;

    // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
    std::vector<float> data;
};

struct llama_control_vector_load_info {
    float strength;

    std::string fname;
};

// Load control vectors from the given files, scale each by its strength, and add them together.
// On error, returns a result with n_embd == -1 and empty data.
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
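// Example (sketch; "cv.gguf" is a hypothetical control vector file):
//
//     const std::vector<llama_control_vector_load_info> infos = { { 0.8f, "cv.gguf" } };
//     const llama_control_vector_data cv = llama_control_vector_load(infos);
//     if (cv.n_embd == -1) { /* load failed */ }

//
// Split utils
//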
static const char * const LLM_KV_SPLIT_NO            = "split.no";
static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";