autoagents-llamacpp 0.3.7

Agent Framework for Building Autonomous Agents
//! Configuration structures for llama.cpp provider.

use crate::models::ModelSource;
use llama_cpp_2::model::params::LlamaSplitMode;
use serde::{Deserialize, Serialize};

/// Serializable split mode wrapper for llama.cpp.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum LlamaCppSplitMode {
    /// Single device.
    None,
    /// Split layers and KV across GPUs.
    Layer,
    /// Split layers and KV across GPUs, use tensor parallelism if supported.
    Row,
}

impl From<LlamaCppSplitMode> for LlamaSplitMode {
    fn from(value: LlamaCppSplitMode) -> Self {
        match value {
            LlamaCppSplitMode::None => LlamaSplitMode::None,
            LlamaCppSplitMode::Layer => LlamaSplitMode::Layer,
            LlamaCppSplitMode::Row => LlamaSplitMode::Row,
        }
    }
}

/// Reasoning extraction format for llama.cpp OpenAI-compatible parsing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum LlamaCppReasoningFormat {
    /// Disable reasoning extraction into `reasoning_content`.
    None,
    /// Let llama.cpp auto-detect the model/template strategy.
    Auto,
    /// Parse DeepSeek/Qwen-style thinking into `reasoning_content`.
    Deepseek,
    /// Legacy DeepSeek behavior.
    DeepseekLegacy,
}

impl LlamaCppReasoningFormat {
    /// Convert to llama.cpp reasoning format string.
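    ///
    /// # Examples
    ///
    /// ```ignore
    /// // Illustrative; the import path depends on this crate's module layout.
    /// assert_eq!(LlamaCppReasoningFormat::Deepseek.as_str(), Some("deepseek"));
    /// assert_eq!(LlamaCppReasoningFormat::None.as_str(), None);
    /// ```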
    pub fn as_str(self) -> Option<&'static str> {
        match self {
            Self::None => None,
            Self::Auto => Some("auto"),
            Self::Deepseek => Some("deepseek"),
            Self::DeepseekLegacy => Some("deepseek_legacy"),
        }
    }
}

/// Complete configuration for LlamaCppProvider.
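///
/// The `Default` implementation uses an empty GGUF `model_source`, sets
/// `max_tokens = Some(512)`, `temperature = Some(0.7)`, and
/// `force_json_grammar = false`; every other option is `None`.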
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LlamaCppConfig {
    /// Model source (GGUF path).
    pub model_source: ModelSource,

    /// Optional chat template name or inline template.
    pub chat_template: Option<String>,

    /// Optional system prompt to prepend if no system message exists.
    pub system_prompt: Option<String>,

    /// Force JSON grammar enforcement even without a structured output schema.
    pub force_json_grammar: bool,

    /// Reasoning extraction mode for structured `reasoning_content`.
    pub reasoning_format: Option<LlamaCppReasoningFormat>,

    /// Optional `chat_template_kwargs` object passed to llama.cpp's OpenAI template API.
    ///
    /// Expected shape:
    /// `{ "chat_template_kwargs": { ... } }`
    pub extra_body: Option<serde_json::Value>,

    /// Optional HuggingFace cache directory (defaults to `HF_HOME` or `~/.cache/huggingface/hub`).
    pub model_dir: Option<String>,

    /// Optional HuggingFace filename override (GGUF file).
    pub hf_filename: Option<String>,

    /// Optional HuggingFace revision (defaults to "main").
    pub hf_revision: Option<String>,

    /// Optional multimodal projection file for MTMD models.
    pub mmproj_path: Option<String>,

    /// Optional MTMD media marker override.
    pub media_marker: Option<String>,

    /// Enable GPU offload for MTMD projection.
    pub mmproj_use_gpu: Option<bool>,

    /// Maximum tokens to generate.
    pub max_tokens: Option<u32>,

    /// Sampling temperature (0.0 - 2.0).
    pub temperature: Option<f32>,

    /// Top-p sampling parameter.
    pub top_p: Option<f32>,

    /// Top-k sampling parameter.
    pub top_k: Option<u32>,

    /// Repeat penalty (1.0 disables).
    pub repeat_penalty: Option<f32>,

    /// Penalize frequency of tokens (0.0 disables).
    pub frequency_penalty: Option<f32>,

    /// Penalize presence of tokens (0.0 disables).
    pub presence_penalty: Option<f32>,

    /// Number of tokens to consider for penalties (None = default 64).
    pub repeat_last_n: Option<i32>,

    /// RNG seed for sampling.
    pub seed: Option<u32>,

    /// Context size override.
    pub n_ctx: Option<u32>,

    /// Batch size override.
    pub n_batch: Option<u32>,

    /// Micro-batch size override.
    pub n_ubatch: Option<u32>,

    /// Number of threads for prompt evaluation.
    pub n_threads: Option<i32>,

    /// Number of threads for batch evaluation.
    pub n_threads_batch: Option<i32>,

    /// Number of GPU layers to offload.
    pub n_gpu_layers: Option<u32>,

    /// Main GPU index.
    pub main_gpu: Option<i32>,

    /// Split mode for multi-GPU.
    pub split_mode: Option<LlamaCppSplitMode>,

    /// Enable memory lock (mlock) if supported.
    pub use_mlock: Option<bool>,

    /// Explicit device indices for offload.
    pub devices: Option<Vec<usize>>,
}

impl Default for LlamaCppConfig {
    fn default() -> Self {
        Self {
            model_source: ModelSource::Gguf {
                model_path: String::default(),
            },
            chat_template: None,
            system_prompt: None,
            force_json_grammar: false,
            reasoning_format: None,
            extra_body: None,
            model_dir: None,
            hf_filename: None,
            hf_revision: None,
            mmproj_path: None,
            media_marker: None,
            mmproj_use_gpu: None,
            max_tokens: Some(512),
            temperature: Some(0.7),
            top_p: None,
            top_k: None,
            repeat_penalty: None,
            frequency_penalty: None,
            presence_penalty: None,
            repeat_last_n: None,
            seed: None,
            n_ctx: None,
            n_batch: None,
            n_ubatch: None,
            n_threads: None,
            n_threads_batch: None,
            n_gpu_layers: None,
            main_gpu: None,
            split_mode: None,
            use_mlock: None,
            devices: None,
        }
    }
}

/// Builder for LlamaCppConfig.
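///
/// # Examples
///
/// A minimal sketch; the `use` path is illustrative and depends on this
/// crate's module layout.
///
/// ```ignore
/// use autoagents_llamacpp::config::{LlamaCppConfigBuilder, LlamaCppSplitMode};
///
/// let config = LlamaCppConfigBuilder::new()
///     .model_path("model.gguf")
///     .n_ctx(4096)
///     .n_gpu_layers(32)
///     .split_mode(LlamaCppSplitMode::Layer)
///     .build();
/// assert_eq!(config.n_ctx, Some(4096));
/// ```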
#[derive(Debug, Default)]
pub struct LlamaCppConfigBuilder {
    config: LlamaCppConfig,
}

impl LlamaCppConfigBuilder {
    /// Create a new builder with default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the model source.
    pub fn model_source(mut self, source: ModelSource) -> Self {
        self.config.model_source = source;
        self
    }

    /// Set the model path for a local GGUF model.
    pub fn model_path(mut self, path: impl Into<String>) -> Self {
        self.config.model_source = ModelSource::gguf(path);
        self
    }

    /// Set chat template.
    pub fn chat_template(mut self, template: impl Into<String>) -> Self {
        self.config.chat_template = Some(template.into());
        self
    }

    /// Set system prompt.
    pub fn system_prompt(mut self, prompt: impl Into<String>) -> Self {
        self.config.system_prompt = Some(prompt.into());
        self
    }

    /// Force JSON grammar enforcement even without a structured output schema.
    pub fn force_json_grammar(mut self, force: bool) -> Self {
        self.config.force_json_grammar = force;
        self
    }

    /// Set reasoning extraction format.
    pub fn reasoning_format(mut self, format: LlamaCppReasoningFormat) -> Self {
        self.config.reasoning_format = Some(format);
        self
    }

    /// Set optional `chat_template_kwargs` payload for llama.cpp OpenAI template rendering.
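    ///
    /// Note: if the value fails to serialize, the error is discarded and
    /// `extra_body` is set to `None`.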
    pub fn extra_body(mut self, extra_body: impl Serialize) -> Self {
        self.config.extra_body = serde_json::to_value(extra_body).ok();
        self
    }

    /// Set the HuggingFace cache directory.
    pub fn model_dir(mut self, dir: impl Into<String>) -> Self {
        self.config.model_dir = Some(dir.into());
        self
    }

    /// Set the HuggingFace filename (GGUF file).
    pub fn hf_filename(mut self, filename: impl Into<String>) -> Self {
        self.config.hf_filename = Some(filename.into());
        self
    }

    /// Set the HuggingFace revision.
    pub fn hf_revision(mut self, revision: impl Into<String>) -> Self {
        self.config.hf_revision = Some(revision.into());
        self
    }

    /// Set the multimodal projection (mmproj) file path.
    pub fn mmproj_path(mut self, path: impl Into<String>) -> Self {
        self.config.mmproj_path = Some(path.into());
        self
    }

    /// Set MTMD media marker.
    pub fn media_marker(mut self, marker: impl Into<String>) -> Self {
        self.config.media_marker = Some(marker.into());
        self
    }

    /// Enable or disable GPU offload for MTMD projection.
    pub fn mmproj_use_gpu(mut self, use_gpu: bool) -> Self {
        self.config.mmproj_use_gpu = Some(use_gpu);
        self
    }

    /// Set maximum tokens to generate.
    pub fn max_tokens(mut self, tokens: u32) -> Self {
        self.config.max_tokens = Some(tokens);
        self
    }

    /// Set sampling temperature.
    pub fn temperature(mut self, temp: f32) -> Self {
        self.config.temperature = Some(temp);
        self
    }

    /// Set top-p sampling parameter.
    pub fn top_p(mut self, p: f32) -> Self {
        self.config.top_p = Some(p);
        self
    }

    /// Set top-k sampling parameter.
    pub fn top_k(mut self, k: u32) -> Self {
        self.config.top_k = Some(k);
        self
    }

    /// Set repeat penalty.
    pub fn repeat_penalty(mut self, penalty: f32) -> Self {
        self.config.repeat_penalty = Some(penalty);
        self
    }

    /// Set frequency penalty.
    pub fn frequency_penalty(mut self, penalty: f32) -> Self {
        self.config.frequency_penalty = Some(penalty);
        self
    }

    /// Set presence penalty.
    pub fn presence_penalty(mut self, penalty: f32) -> Self {
        self.config.presence_penalty = Some(penalty);
        self
    }

    /// Set repeat last N for penalties.
    pub fn repeat_last_n(mut self, last_n: i32) -> Self {
        self.config.repeat_last_n = Some(last_n);
        self
    }

    /// Set sampling seed.
    pub fn seed(mut self, seed: u32) -> Self {
        self.config.seed = Some(seed);
        self
    }

    /// Set context size.
    pub fn n_ctx(mut self, n_ctx: u32) -> Self {
        self.config.n_ctx = Some(n_ctx);
        self
    }

    /// Set batch size.
    pub fn n_batch(mut self, n_batch: u32) -> Self {
        self.config.n_batch = Some(n_batch);
        self
    }

    /// Set micro-batch size.
    pub fn n_ubatch(mut self, n_ubatch: u32) -> Self {
        self.config.n_ubatch = Some(n_ubatch);
        self
    }

    /// Set number of threads for prompt evaluation.
    pub fn n_threads(mut self, n_threads: i32) -> Self {
        self.config.n_threads = Some(n_threads);
        self
    }

    /// Set number of threads for batch evaluation.
    pub fn n_threads_batch(mut self, n_threads: i32) -> Self {
        self.config.n_threads_batch = Some(n_threads);
        self
    }

    /// Set number of GPU layers to offload.
    pub fn n_gpu_layers(mut self, layers: u32) -> Self {
        self.config.n_gpu_layers = Some(layers);
        self
    }

    /// Set main GPU index.
    pub fn main_gpu(mut self, main_gpu: i32) -> Self {
        self.config.main_gpu = Some(main_gpu);
        self
    }

    /// Set split mode.
    pub fn split_mode(mut self, mode: LlamaCppSplitMode) -> Self {
        self.config.split_mode = Some(mode);
        self
    }

    /// Enable memory lock.
    pub fn use_mlock(mut self, use_mlock: bool) -> Self {
        self.config.use_mlock = Some(use_mlock);
        self
    }

    /// Set explicit device indices for offload.
    pub fn devices(mut self, devices: Vec<usize>) -> Self {
        self.config.devices = Some(devices);
        self
    }

    /// Build the configuration.
    pub fn build(self) -> LlamaCppConfig {
        self.config
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_config_builder_basic() {
        let config = LlamaCppConfigBuilder::default()
            .model_path("model.gguf")
            .max_tokens(1024)
            .temperature(0.8)
            .build();

        assert_eq!(
            config.model_source,
            ModelSource::Gguf {
                model_path: "model.gguf".to_string(),
            }
        );
        assert_eq!(config.max_tokens, Some(1024));
        assert_eq!(config.temperature, Some(0.8));
    }

    #[test]
    fn test_config_builder_optional_flags() {
        let config = LlamaCppConfigBuilder::default()
            .model_path("model.gguf")
            .force_json_grammar(true)
            .reasoning_format(LlamaCppReasoningFormat::Deepseek)
            .extra_body(serde_json::json!({
                "chat_template_kwargs": {
                    "enable_thinking": true
                }
            }))
            .mmproj_use_gpu(true)
            .split_mode(LlamaCppSplitMode::Layer)
            .use_mlock(true)
            .devices(vec![0, 1])
            .build();

        assert!(config.force_json_grammar);
        assert_eq!(
            config.reasoning_format,
            Some(LlamaCppReasoningFormat::Deepseek)
        );
        assert_eq!(
            config
                .extra_body
                .as_ref()
                .and_then(|v| v.get("chat_template_kwargs"))
                .and_then(|v| v.get("enable_thinking"))
                .and_then(|v| v.as_bool()),
            Some(true)
        );
        assert_eq!(config.mmproj_use_gpu, Some(true));
        assert_eq!(config.split_mode, Some(LlamaCppSplitMode::Layer));
        assert_eq!(config.use_mlock, Some(true));
        assert_eq!(config.devices, Some(vec![0, 1]));
    }

    #[test]
    fn test_config_default_reasoning_format_is_opt_in() {
        let config = LlamaCppConfig::default();
        assert_eq!(config.reasoning_format, None);
    }

    #[test]
    fn test_config_builder_selected_options() {
        let config = LlamaCppConfigBuilder::default()
            .model_source(ModelSource::huggingface_with_filename(
                "org/model",
                "model.gguf",
            ))
            .chat_template("chat-template")
            .system_prompt("system")
            .model_dir("cache")
            .hf_filename("override.gguf")
            .hf_revision("rev1")
            .mmproj_path("mmproj.gguf")
            .media_marker("[IMG]")
            .max_tokens(123)
            .temperature(0.5)
            .top_p(0.9)
            .top_k(42)
            .repeat_penalty(1.1)
            .frequency_penalty(0.2)
            .presence_penalty(0.3)
            .repeat_last_n(32)
            .seed(7)
            .n_ctx(2048)
            .n_batch(64)
            .n_ubatch(8)
            .n_threads(4)
            .n_threads_batch(2)
            .n_gpu_layers(3)
            .main_gpu(1)
            .build();

        assert!(matches!(
            config.model_source,
            ModelSource::HuggingFace { .. }
        ));
        assert_eq!(config.chat_template.as_deref(), Some("chat-template"));
        assert_eq!(config.system_prompt.as_deref(), Some("system"));
        assert_eq!(config.model_dir.as_deref(), Some("cache"));
        assert_eq!(config.hf_filename.as_deref(), Some("override.gguf"));
        assert_eq!(config.hf_revision.as_deref(), Some("rev1"));
        assert_eq!(config.mmproj_path.as_deref(), Some("mmproj.gguf"));
        assert_eq!(config.media_marker.as_deref(), Some("[IMG]"));
        assert_eq!(config.max_tokens, Some(123));
        assert_eq!(config.temperature, Some(0.5));
        assert_eq!(config.n_ctx, Some(2048));
        assert_eq!(config.n_threads, Some(4));
        assert_eq!(config.n_gpu_layers, Some(3));
        assert_eq!(config.main_gpu, Some(1));
    }
}