ai_tokenopt 0.5.9

Adaptive token optimization engine for LLM inference pipelines — compresses prompts, conversation history, tool schemas, and output streams to minimize token usage while preserving response quality.
Documentation
//! Configuration for the token optimization engine

use serde::{Deserialize, Serialize};

/// Configuration for the token optimization engine.
///
/// Controls context window budgeting, compaction triggers, summarization
/// limits, and output stream monitoring. Loaded from the `[token_optimization]`
/// section in `config.toml`.
///
/// Every field carries a serde default, so a partial (or empty) table
/// deserializes successfully; omitted keys take the values documented on each
/// field, which are the same values produced by [`Default::default`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenOptimizationConfig {
    /// Whether token optimization is enabled (default: true)
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Total context window size in tokens (default: 8192)
    ///
    /// Should match the `num_ctx` value configured for the active Ollama model.
    /// Can be replaced at runtime by
    /// [`auto_detect_context_window`](Self::auto_detect_context_window).
    #[serde(default = "default_context_window")]
    pub context_window_tokens: u32,

    /// Fraction of context window reserved for the LLM response (default: 0.25)
    ///
    /// A value of 0.25 means 25% of the context window is kept free for output
    /// generation. The remaining 75% is available for system prompt, RAG, history,
    /// and tool definitions.
    #[serde(default = "default_response_headroom")]
    pub response_headroom_ratio: f32,

    /// Fraction of the available input budget whose use triggers compaction
    /// (default: 0.70)
    ///
    /// When the estimated input tokens exceed this ratio of the available budget
    /// (context window minus response headroom), conversation history is compacted.
    ///
    /// Overwritten by [`apply_hardware_profile`](Self::apply_hardware_profile).
    #[serde(default = "default_compaction_trigger")]
    pub compaction_trigger_ratio: f32,

    /// Maximum tokens for the rolling conversation summary (default: 256)
    #[serde(default = "default_max_summary_tokens")]
    pub max_summary_tokens: u32,

    /// Maximum fraction of available budget for the system prompt (default: 0.15)
    ///
    /// "Available budget" is the context window minus the response headroom.
    #[serde(default = "default_system_prompt_budget")]
    pub system_prompt_budget_ratio: f32,

    /// Maximum fraction of available budget for RAG context (default: 0.15)
    ///
    /// "Available budget" is the context window minus the response headroom.
    #[serde(default = "default_rag_budget")]
    pub rag_budget_ratio: f32,

    /// Whether output stream repetition detection is enabled (default: true)
    #[serde(default = "default_true")]
    pub repetition_detection_enabled: bool,

    /// N-gram size for repetition detection (default: 3)
    #[serde(default = "default_ngram_size")]
    pub repetition_ngram_size: usize,

    /// Threshold ratio of repeated n-grams to trigger early stream abort (default: 0.3)
    ///
    /// When 30% or more of recent n-grams in the output stream are repeats,
    /// the stream is terminated early to prevent wasting tokens.
    #[serde(default = "default_repetition_threshold")]
    pub repetition_threshold: f32,

    /// Maximum number of tools to send to the LLM per request (default: 8)
    ///
    /// Overwritten by [`apply_hardware_profile`](Self::apply_hardware_profile).
    #[serde(default = "default_max_tools")]
    pub max_tools_per_request: usize,

    /// HuggingFace tokenizer model identifier for exact token counting
    /// (default: None).
    ///
    /// When set, the optimizer loads the specified tokenizer from the
    /// HuggingFace Hub (or a local file path) for exact token counts.
    /// Falls back to heuristic estimation on failure.
    ///
    /// Examples: `"meta-llama/Llama-3.2-3B"`, `"/path/to/tokenizer.json"`
    ///
    /// Requires the `hf-tokenizer` feature (enabled by default).
    #[serde(default)]
    pub tokenizer_model: Option<String>,

    /// Hard cap on the dynamic output token budget (default: None = uncapped).
    ///
    /// When set, the `recommended_max_tokens` produced by the output budget
    /// calculator is clamped to this value. Useful for keeping responses
    /// concise even when the context window would allow larger output.
    #[serde(default)]
    pub output_max_tokens: Option<u32>,

    /// Repetition penalty sent to Ollama as `repeat_penalty` (default: None).
    ///
    /// Values > 1.0 discourage the model from repeating tokens it has already
    /// generated. Ollama's own default is `1.1`. Recommended: `1.1`–`1.3`.
    /// `None` means the Ollama default is used.
    ///
    /// NOTE(review): the field is named `frequency_penalty` while the text
    /// above describes Ollama's `repeat_penalty`; confirm against the request
    /// builder which option this value is actually mapped to.
    #[serde(default)]
    pub frequency_penalty: Option<f32>,

    /// Presence penalty sent to Ollama as `presence_penalty` (default: None).
    ///
    /// Adds a constant penalty for each unique token that has appeared in the
    /// output so far, promoting topic diversity. Recommended: `0.4`–`0.8`.
    /// `None` means the Ollama default (`0.0`) is used.
    #[serde(default)]
    pub presence_penalty: Option<f32>,

    /// Whether progressive tool compression is enabled (default: true).
    ///
    /// When enabled, tool definitions that the LLM has already seen in the
    /// current conversation have their descriptions stripped on subsequent
    /// turns, reducing token usage while preserving the tool schema.
    ///
    /// Overwritten by [`apply_hardware_profile`](Self::apply_hardware_profile).
    #[serde(default = "default_true")]
    pub progressive_tool_compression: bool,

    /// Token pressure threshold for injecting conciseness directives (default: 0.7).
    ///
    /// When the ratio of estimated input tokens to budget exceeds this value,
    /// a brevity instruction is appended to the system prompt. At > 0.9,
    /// an additional structured-format hint is injected.
    #[serde(default = "default_conciseness_threshold")]
    pub conciseness_pressure_threshold: f32,

    /// Maximum tokens for tool result content in historical messages (default: 100).
    ///
    /// Tool results from previous turns are truncated to this budget using
    /// extractive summarization (key JSON fields, priority text lines).
    /// Current-turn tool results are left intact so the LLM can reason
    /// over the full output.
    #[serde(default = "default_tool_result_max_tokens")]
    pub tool_result_max_tokens: u32,

    /// Token budget for conversation history windowing
    /// (default: None = auto-computed).
    ///
    /// When set, the application layer trims conversation history to
    /// fit within this budget before the token-optimization decorator
    /// performs precise compaction. When `None`,
    /// [`effective_max_history_tokens`](Self::effective_max_history_tokens)
    /// derives the budget from the context window, system prompt, RAG, and
    /// response headroom settings.
    #[serde(default)]
    pub max_history_tokens: Option<u32>,

    /// Maximum tokens for agent profile prompt content (default: 300).
    ///
    /// Caps the token budget consumed by agent profile descriptions when
    /// building the system prompt for agentic sub-tasks. Prevents verbose
    /// profile definitions from crowding out other context.
    #[serde(default = "default_max_profile_prompt_tokens")]
    pub max_profile_prompt_tokens: u32,

    /// Directory path for runtime prompt template overrides (default: None).
    ///
    /// When set, the [`TemplateLoader`](crate::prompt::template_loader::TemplateLoader)
    /// attempts to read `{prompt_template_dir}/{name}.prompt.txt` before falling
    /// back to the compiled-in templates bundled via `build.rs`.
    ///
    /// Set to a writable directory to customise individual prompt templates
    /// without recompiling the binary.
    #[serde(default)]
    pub prompt_template_dir: Option<String>,
}

impl TokenOptimizationConfig {
    /// Replace `context_window_tokens` with the size reported for `model`.
    ///
    /// Asks `port` (a [`ModelInfoPort`](crate::ports::ModelInfoPort)) for the
    /// model's metadata. On success the reported `context_length` is logged
    /// next to the value it replaces and then stored in
    /// `context_window_tokens`. The full
    /// [`ModelInfo`](crate::profile::ModelInfo) is returned so the caller can
    /// reuse it, e.g. for [`apply_hardware_profile`](Self::apply_hardware_profile).
    ///
    /// # Errors
    ///
    /// Returns a [`TokenOptError`](crate::error::TokenOptError) when the port
    /// lookup fails; the configured window is left untouched in that case.
    pub async fn auto_detect_context_window(
        &mut self,
        port: &dyn crate::ports::ModelInfoPort,
        model: &str,
    ) -> Result<crate::profile::ModelInfo, crate::error::TokenOptError> {
        let metadata = port.get_model_info(model).await?;
        let detected = metadata.context_length;
        let previous = self.context_window_tokens;

        tracing::info!(
            model,
            detected,
            previous,
            "Auto-detected context window from model metadata"
        );

        self.context_window_tokens = detected;
        Ok(metadata)
    }

    /// Overwrite the hardware-sensitive settings with profile values.
    ///
    /// Starts from [`detect_profile`](crate::profile::detect_profile) and,
    /// when `model_info` is supplied, refines the result through
    /// [`adjust_profile`](crate::profile::adjust_profile).
    ///
    /// Exactly three fields are written, unconditionally:
    /// `compaction_trigger_ratio`, `max_tools_per_request`, and
    /// `progressive_tool_compression`. Whatever values they held before
    /// (defaults or user-configured) are replaced. The outcome is logged.
    pub fn apply_hardware_profile(&mut self, model_info: Option<&crate::profile::ModelInfo>) {
        let detected = crate::profile::detect_profile();
        let profile = match model_info {
            Some(info) => crate::profile::adjust_profile(detected, info),
            None => detected,
        };
        let settings = profile.config();

        // The profile threshold is narrowed to the `f32` used by this config.
        #[allow(clippy::cast_possible_truncation)]
        let trigger = settings.compaction_threshold as f32;

        self.compaction_trigger_ratio = trigger;
        self.max_tools_per_request = settings.max_tools;
        self.progressive_tool_compression = settings.progressive_tools;

        tracing::info!(
            ?profile,
            compaction_trigger = self.compaction_trigger_ratio,
            max_tools = self.max_tools_per_request,
            progressive_tools = self.progressive_tool_compression,
            "Applied hardware profile defaults"
        );
    }

    /// Token budget that conversation history is allowed to occupy.
    ///
    /// An explicit `max_history_tokens` wins. Otherwise the budget is derived
    /// in two steps, each truncated to whole tokens:
    ///
    /// 1. `available = context_window_tokens * (1 - response_headroom_ratio)`
    /// 2. `reserved  = available * (system_prompt_budget_ratio + rag_budget_ratio)`
    ///
    /// The result is `available - reserved`, floored at zero. With the default
    /// configuration this yields 4301 tokens.
    #[must_use]
    pub fn effective_max_history_tokens(&self) -> u32 {
        self.max_history_tokens.unwrap_or_else(|| {
            // Share of the window left for input once response headroom is set aside.
            let input_share = f64::from(1.0 - self.response_headroom_ratio);
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let available = (f64::from(self.context_window_tokens) * input_share) as u32;

            // Share of the input budget claimed by the system prompt and RAG context.
            let fixed_share = f64::from(self.system_prompt_budget_ratio + self.rag_budget_ratio);
            #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
            let reserved = (f64::from(available) * fixed_share) as u32;

            available.saturating_sub(reserved)
        })
    }
}

impl Default for TokenOptimizationConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            context_window_tokens: default_context_window(),
            response_headroom_ratio: default_response_headroom(),
            compaction_trigger_ratio: default_compaction_trigger(),
            max_summary_tokens: default_max_summary_tokens(),
            system_prompt_budget_ratio: default_system_prompt_budget(),
            rag_budget_ratio: default_rag_budget(),
            repetition_detection_enabled: true,
            repetition_ngram_size: default_ngram_size(),
            repetition_threshold: default_repetition_threshold(),
            max_tools_per_request: default_max_tools(),
            tokenizer_model: None,
            output_max_tokens: None,
            frequency_penalty: None,
            presence_penalty: None,
            progressive_tool_compression: true,
            conciseness_pressure_threshold: default_conciseness_threshold(),
            tool_result_max_tokens: default_tool_result_max_tokens(),
            max_history_tokens: None,
            max_profile_prompt_tokens: default_max_profile_prompt_tokens(),
            prompt_template_dir: None,
        }
    }
}

/// Serde fallback for the boolean switches that are on unless disabled
/// (`enabled`, `repetition_detection_enabled`, `progressive_tool_compression`).
const fn default_true() -> bool {
    true
}
/// Serde fallback for `context_window_tokens`: an 8192-token window.
const fn default_context_window() -> u32 {
    8_192
}
/// Serde fallback for `response_headroom_ratio`: a quarter of the window.
const fn default_response_headroom() -> f32 {
    0.25_f32
}
/// Serde fallback for `compaction_trigger_ratio`: compact at 70% usage.
const fn default_compaction_trigger() -> f32 {
    0.70_f32
}
/// Serde fallback for `max_summary_tokens`: 256 tokens.
const fn default_max_summary_tokens() -> u32 {
    256_u32
}
/// Serde fallback for `system_prompt_budget_ratio`: 15% of the available budget.
const fn default_system_prompt_budget() -> f32 {
    0.15_f32
}
/// Serde fallback for `rag_budget_ratio`: 15% of the available budget.
const fn default_rag_budget() -> f32 {
    0.15_f32
}
/// Serde fallback for `repetition_ngram_size`: trigrams.
const fn default_ngram_size() -> usize {
    3_usize
}
/// Serde fallback for `repetition_threshold`: abort at 30% repeated n-grams.
const fn default_repetition_threshold() -> f32 {
    0.3_f32
}
/// Serde fallback for `max_tools_per_request`: eight tools.
const fn default_max_tools() -> usize {
    8_usize
}
/// Serde fallback for `conciseness_pressure_threshold`: 70% token pressure.
const fn default_conciseness_threshold() -> f32 {
    0.7_f32
}
/// Serde fallback for `tool_result_max_tokens`: 100 tokens.
const fn default_tool_result_max_tokens() -> u32 {
    100_u32
}
/// Serde fallback for `max_profile_prompt_tokens`: 300 tokens.
const fn default_max_profile_prompt_tokens() -> u32 {
    300_u32
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Deserializes a TOML snippet into a config, panicking on malformed input.
    fn parse(raw: &str) -> TokenOptimizationConfig {
        toml::from_str(raw).unwrap()
    }

    /// Float equality within one `f32::EPSILON`.
    fn approx(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    /// The built-in defaults match the documented values.
    #[test]
    fn default_config_has_sane_values() {
        let cfg = TokenOptimizationConfig::default();
        assert!(cfg.enabled);
        assert_eq!(cfg.context_window_tokens, 8192);
        assert!(approx(cfg.response_headroom_ratio, 0.25));
        assert!(approx(cfg.compaction_trigger_ratio, 0.70));
        assert_eq!(cfg.max_summary_tokens, 256);
        assert!(cfg.repetition_detection_enabled);
        assert_eq!(cfg.repetition_ngram_size, 3);
    }

    /// Headroom, system prompt and RAG ratios together stay below 60%.
    #[test]
    fn budget_ratios_leave_room_for_history() {
        let cfg = TokenOptimizationConfig::default();
        let fixed_ratio =
            cfg.response_headroom_ratio + cfg.system_prompt_budget_ratio + cfg.rag_budget_ratio;
        // Fixed allocations should leave at least 40% for history
        assert!(fixed_ratio < 0.60, "fixed ratio {fixed_ratio} too high");
    }

    /// Keys missing from the TOML fall back to their serde defaults.
    #[test]
    fn deserialization_with_defaults() {
        let raw = r"
            enabled = true
            context_window_tokens = 4096
        ";
        let cfg = parse(raw);
        assert_eq!(cfg.context_window_tokens, 4096);
        // Non-specified fields should use defaults
        assert!(approx(cfg.response_headroom_ratio, 0.25));
        assert_eq!(cfg.max_summary_tokens, 256);
        assert!(cfg.output_max_tokens.is_none());
    }

    /// The output cap is unset by default.
    #[test]
    fn default_output_max_tokens_is_none() {
        let cfg = TokenOptimizationConfig::default();
        assert!(cfg.output_max_tokens.is_none());
    }

    /// An explicit output cap is read from TOML.
    #[test]
    fn output_max_tokens_deserialization() {
        let raw = r"
            enabled = true
            context_window_tokens = 8192
            output_max_tokens = 1024
        ";
        let cfg = parse(raw);
        assert_eq!(cfg.output_max_tokens, Some(1024));
    }

    /// Both sampling penalties are read from TOML.
    #[test]
    fn sampling_params_deserialization() {
        let raw = r"
            enabled = true
            frequency_penalty = 1.2
            presence_penalty = 0.6
        ";
        let cfg = parse(raw);
        assert!(approx(cfg.frequency_penalty.unwrap(), 1.2));
        assert!(approx(cfg.presence_penalty.unwrap(), 0.6));
    }

    /// Sampling penalties are unset by default.
    #[test]
    fn default_sampling_params_are_none() {
        let cfg = TokenOptimizationConfig::default();
        assert!(cfg.frequency_penalty.is_none());
        assert!(cfg.presence_penalty.is_none());
    }

    /// Progressive tool compression is on by default.
    #[test]
    fn default_progressive_tool_compression_is_true() {
        let cfg = TokenOptimizationConfig::default();
        assert!(cfg.progressive_tool_compression);
    }

    /// Progressive tool compression can be switched off via TOML.
    #[test]
    fn progressive_tool_compression_deserialization() {
        let raw = r"
            enabled = true
            progressive_tool_compression = false
        ";
        let cfg = parse(raw);
        assert!(!cfg.progressive_tool_compression);
    }
}