ai_tokenopt 0.5.8

//! Token optimization orchestrator
//!
//! Composes all optimization components — budget allocation, history
//! compaction, system prompt optimization, and tool compression — into
//! a single API surface used by the decorator.

use tracing::{debug, info, instrument, warn};

use crate::ports::SummarizationPort;
use crate::types::{Conversation, ToolDefinition};

use crate::budget::{BudgetAllocation, TokenBudget};
use crate::config::TokenOptimizationConfig;
use crate::error::TokenOptError;
use crate::estimator::{ConversationTokenEstimate, TokenEstimator};
use crate::history::compactor::{CompactionResult, HistoryCompactor};
use crate::prompt::system_prompt::{PromptContext, inject_conciseness, optimize_system_prompt};
use crate::stream::repetition::RepetitionDetector;
use crate::tools::schema_compressor::compress_tool_definitions;
use crate::tools::selector::select_tools;

/// A planned optimization step with estimated token savings.
///
/// [`OptimizationPlan::build`] only records a step when its estimated
/// savings are greater than zero.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OptimizationStep {
    /// Human-readable name of the optimization strategy, e.g.
    /// `"rag_cross_turn_dedup"`, `"tool_result_compression"` or
    /// `"system_prompt_trim"`.
    pub name: &'static str,
    /// Estimated tokens that would be saved by applying this step.
    pub estimated_savings: u32,
}

/// An ordered list of optimization steps, sorted by descending savings impact.
///
/// Produced by [`OptimizationPlan::build`]. In the optimizer methods of this
/// file the plan is informational: it is computed before any pass runs,
/// logged at DEBUG level by `optimize_conversation` so operators can see the
/// estimated impact of each strategy, and returned in
/// [`OptimizationResult::plan`]. The passes themselves run in a fixed order.
///
/// The derived `Default` yields a plan with no steps.
#[derive(Debug, Default)]
pub struct OptimizationPlan {
    /// Steps ordered from highest to lowest estimated savings.
    pub steps: Vec<OptimizationStep>,
}

impl OptimizationPlan {
    /// Build a plan from the current conversation state.
    ///
    /// Estimates savings for each lightweight pre-compaction strategy:
    /// - Cross-turn RAG deduplication
    /// - Old tool result compression
    /// - System prompt trimming
    ///
    /// Returns steps sorted by descending `estimated_savings`.
    #[must_use]
    pub fn build(
        conversation: &crate::types::Conversation,
        allocation: &BudgetAllocation,
        tool_result_max_tokens: u32,
    ) -> Self {
        let mut steps = Vec::new();

        // Estimate savings from cross-turn RAG dedup: count RAG blocks in messages
        // older than 5 turns; each stripped block saves roughly its token count.
        let total_msgs = conversation.messages.len();
        let decay_cutoff = total_msgs.saturating_sub(10); // 5 turns × 2 msgs
        let rag_savings: u32 = conversation.messages[..decay_cutoff]
            .iter()
            .filter(|m| {
                m.content.contains("YOUR MEMORY:") || m.content.contains("YOUR KNOWLEDGE GRAPH:")
            })
            .map(|m| TokenEstimator::estimate_tokens(&m.content))
            .sum();
        if rag_savings > 0 {
            steps.push(OptimizationStep {
                name: "rag_cross_turn_dedup",
                estimated_savings: rag_savings,
            });
        }

        // Estimate savings from old tool result compression
        let last_user_idx = conversation
            .messages
            .iter()
            .rposition(|m| m.role == crate::types::MessageRole::User)
            .unwrap_or(conversation.messages.len());
        let tool_savings: u32 = conversation.messages[..last_user_idx]
            .iter()
            .filter(|m| m.role == crate::types::MessageRole::Tool)
            .map(|m| {
                let tokens = TokenEstimator::estimate_tokens(&m.content);
                tokens.saturating_sub(tool_result_max_tokens.min(tokens))
            })
            .sum();
        if tool_savings > 0 {
            steps.push(OptimizationStep {
                name: "tool_result_compression",
                estimated_savings: tool_savings,
            });
        }

        // Estimate savings from system prompt trimming: how much over budget?
        if let Some(ref prompt) = conversation.system_prompt {
            let prompt_tokens = TokenEstimator::estimate_tokens(prompt);
            let prompt_savings = prompt_tokens.saturating_sub(allocation.system_prompt);
            if prompt_savings > 0 {
                steps.push(OptimizationStep {
                    name: "system_prompt_trim",
                    estimated_savings: prompt_savings,
                });
            }
        }

        // Sort descending by savings
        steps.sort_by(|a, b| b.estimated_savings.cmp(&a.estimated_savings));

        Self { steps }
    }

    /// Sum of `estimated_savings` over every step in the plan
    /// (zero for an empty plan).
    #[must_use]
    pub fn total_estimated_savings(&self) -> u32 {
        let per_step = self.steps.iter().map(|step| step.estimated_savings);
        per_step.sum()
    }
}

/// Result of optimizing a conversation's input.
///
/// Returned by `TokenOptimizer::optimize_conversation` and
/// `TokenOptimizer::optimize_conversation_with_tools`.
#[derive(Debug)]
pub struct OptimizationResult {
    /// Token budget allocation used. The optimizer methods in this file store
    /// the allocation recomputed after the lightweight passes have run.
    pub budget: BudgetAllocation,
    /// Token estimate before optimization
    pub estimate_before: ConversationTokenEstimate,
    /// Token estimate after optimization
    pub estimate_after: ConversationTokenEstimate,
    /// History compaction result. `None` when compaction was not required or
    /// when the compactor returned an error (which is logged and ignored).
    pub compaction: Option<CompactionResult>,
    /// Whether the system prompt was trimmed
    pub system_prompt_trimmed: bool,
    /// Recommended `max_tokens` for the LLM response.
    /// The optimizer methods in this file copy this from
    /// `config.output_max_tokens`, so it is `None` when no output cap is
    /// configured.
    pub recommended_max_tokens: Option<u32>,
    /// The optimization plan that was computed before any pass was applied.
    pub plan: OptimizationPlan,
}

/// Central orchestrator composing all token optimization components.
///
/// Built with [`TokenOptimizer::new`]; optional collaborators are attached
/// through the `with_*` builder methods.
#[derive(Debug)]
pub struct TokenOptimizer {
    /// Configuration the optimizer was constructed with.
    config: TokenOptimizationConfig,
    /// Budget allocator derived from `config`.
    budget: TokenBudget,
    /// History compactor, created from `config.max_summary_tokens`.
    compactor: HistoryCompactor,
    /// Per-model estimation correction.
    /// `None` until `with_calibration` is called; wrapped in a lock so
    /// observations can be recorded through `&self`.
    calibrator: Option<std::sync::RwLock<crate::estimator_tuning::EstimationCalibrator>>,
    /// Prometheus-compatible metrics.
    /// `None` until `with_metrics` is called.
    metrics: Option<std::sync::Arc<crate::metrics::OptimizationMetrics>>,
    /// HuggingFace tokenizer for exact token counts.
    /// When `None`, the `estimate_*` helpers fall back to `TokenEstimator`.
    #[cfg(feature = "hf-tokenizer")]
    hf_estimator: Option<std::sync::Arc<crate::estimator_hf::HfTokenEstimator>>,
}

impl TokenOptimizer {
    /// Construct an optimizer from `config`.
    ///
    /// The budget allocator and history compactor are derived from the
    /// configuration. Calibration and metrics start out disabled. With the
    /// `hf-tokenizer` feature, a tokenizer is additionally initialized via
    /// `init_hf_estimator`.
    #[must_use]
    pub fn new(config: TokenOptimizationConfig) -> Self {
        // Fields are evaluated in the order written, so every component that
        // borrows `config` is built before `config` itself is moved in last.
        Self {
            budget: TokenBudget::new(&config),
            compactor: HistoryCompactor::new(config.max_summary_tokens),
            #[cfg(feature = "hf-tokenizer")]
            hf_estimator: init_hf_estimator(&config),
            calibrator: None,
            metrics: None,
            config,
        }
    }

    /// Turn on per-model estimation calibration.
    ///
    /// Installs a fresh calibrator; afterwards every call to
    /// [`report_actual_tokens`](Self::report_actual_tokens) records an
    /// observation into it.
    #[must_use]
    pub fn with_calibration(mut self) -> Self {
        let calibrator = crate::estimator_tuning::EstimationCalibrator::new();
        self.calibrator = Some(std::sync::RwLock::new(calibrator));
        self
    }

    /// Attach a shared metrics sink for observability.
    ///
    /// Replaces any metrics handle attached earlier.
    #[must_use]
    pub fn with_metrics(
        mut self,
        metrics: std::sync::Arc<crate::metrics::OptimizationMetrics>,
    ) -> Self {
        let shared_metrics = Some(metrics);
        self.metrics = shared_metrics;
        self
    }

    /// Attach an already-loaded HuggingFace tokenizer for exact token counts.
    ///
    /// Once set, the private `estimate_*` helpers use this tokenizer in
    /// place of the heuristic `TokenEstimator`. Any tokenizer initialized
    /// earlier is replaced.
    #[cfg(feature = "hf-tokenizer")]
    #[must_use]
    pub fn with_hf_tokenizer(mut self, estimator: crate::estimator_hf::HfTokenEstimator) -> Self {
        let shared = std::sync::Arc::new(estimator);
        self.hf_estimator = Some(shared);
        self
    }

    /// Token estimate for a whole conversation from the best available backend.
    ///
    /// With the `hf-tokenizer` feature and a tokenizer attached, the
    /// tokenizer's count is returned; otherwise the heuristic
    /// `TokenEstimator` is used.
    fn estimate_conversation(&self, conversation: &Conversation) -> ConversationTokenEstimate {
        #[cfg(feature = "hf-tokenizer")]
        {
            if let Some(exact) = self.hf_estimator.as_deref() {
                return exact.count_conversation_tokens(conversation);
            }
        }
        TokenEstimator::estimate_conversation(conversation)
    }

    /// Token estimate for a piece of text from the best available backend
    /// (attached HuggingFace tokenizer if present, heuristic otherwise).
    fn estimate_tokens(&self, text: &str) -> u32 {
        #[cfg(feature = "hf-tokenizer")]
        {
            if let Some(exact) = self.hf_estimator.as_deref() {
                return exact.count_tokens(text);
            }
        }
        TokenEstimator::estimate_tokens(text)
    }

    /// Token estimate for a set of tool definitions from the best available
    /// backend (attached HuggingFace tokenizer if present, heuristic otherwise).
    fn estimate_tool_definitions(&self, tools: &[ToolDefinition]) -> u32 {
        #[cfg(feature = "hf-tokenizer")]
        {
            if let Some(exact) = self.hf_estimator.as_deref() {
                return exact.count_tool_definitions_tokens(tools);
            }
        }
        TokenEstimator::estimate_tool_definitions(tools)
    }

    /// Limit `tokens` to `config.output_max_tokens` when a cap is configured;
    /// without a cap the value is returned unchanged.
    #[cfg(test)]
    fn cap_output_tokens(&self, tokens: u32) -> u32 {
        match self.config.output_max_tokens {
            Some(cap) => cap.min(tokens),
            None => tokens,
        }
    }

    /// Report observed token counts from the LLM for calibration.
    ///
    /// Thread-safe: uses interior mutability so this works through `Arc`.
    /// Does nothing when calibration is not enabled.
    pub fn report_actual_tokens(&self, model: &str, estimated: u32, actual: u32) {
        if let Some(ref cal) = self.calibrator {
            if let Ok(mut guard) = cal.write() {
                guard.record_observation(model, estimated, actual);
            }
        }
    }

    /// Reports the `enabled` flag of the optimizer's configuration.
    #[must_use]
    pub fn is_enabled(&self) -> bool {
        self.config().enabled
    }

    /// Access the optimizer's configuration.
    #[must_use]
    pub fn config(&self) -> &TokenOptimizationConfig {
        &self.config
    }

    /// Optimize a conversation's input to fit within the token budget.
    ///
    /// This is the main entry point for input optimization. It:
    /// 1. Estimates current token usage and budget allocation
    /// 2. Builds an impact-ordered `OptimizationPlan` (logged at DEBUG and
    ///    returned in the result)
    /// 3. Applies the lightweight passes (cross-turn RAG dedup, old tool
    ///    result compression) unconditionally
    /// 4. Re-estimates; only escalates to history compaction if the new
    ///    allocation reports `requires_compaction`
    /// 5. Trims the system prompt when it exceeds its allocation and, if it
    ///    was trimmed, passes it through `inject_conciseness`
    ///
    /// The conversation is modified in place. A compaction failure is logged
    /// at WARN and otherwise ignored; changes already made by the lightweight
    /// passes are kept.
    ///
    /// `inference` is forwarded to the history compactor.
    ///
    /// NOTE(review): `config.enabled` is only recorded on the tracing span;
    /// this method does not check it, so callers presumably gate on
    /// [`is_enabled`](Self::is_enabled) — confirm.
    ///
    /// # Errors
    ///
    /// No path in this body currently returns `Err`.
    #[allow(clippy::too_many_lines)]
    #[instrument(skip(self, conversation, inference), fields(
        msgs = conversation.messages.len(),
        enabled = self.config.enabled,
    ))]
    pub async fn optimize_conversation(
        &self,
        conversation: &mut Conversation,
        inference: Option<&dyn SummarizationPort>,
    ) -> Result<OptimizationResult, TokenOptError> {
        let estimate_before = self.estimate_conversation(conversation);

        debug!(
            system_prompt = estimate_before.system_prompt,
            summary = estimate_before.summary,
            history = estimate_before.history,
            total = estimate_before.total,
            "Token estimate before optimization"
        );

        // Compute initial budget allocation (no RAG or tools in this path)
        let allocation = self
            .budget
            .allocate_adaptive(&estimate_before, false, false, 0);

        // Build impact-ordered plan BEFORE applying any optimization
        let plan = OptimizationPlan::build(
            conversation,
            &allocation,
            self.config.tool_result_max_tokens,
        );

        if !plan.steps.is_empty() {
            debug!(
                steps = ?plan.steps.iter().map(|s| format!("{}(~{}t)", s.name, s.estimated_savings)).collect::<Vec<_>>(),
                total_estimated_savings = plan.total_estimated_savings(),
                "Impact-ordered optimization plan"
            );
        }

        // Apply the lightweight optimizations in a fixed order.
        // RAG dedup and tool compression are applied unconditionally (they never
        // increase tokens); system prompt trimming happens in the prompt phase.
        // Turn index approximated as two messages per turn.
        let current_turn = conversation.messages.len() / 2;
        let rag_dedup_saved = crate::prompt::rag_cross_turn_dedup::deduplicate_rag_across_turns(
            &mut conversation.messages,
            current_turn,
            None,
        );
        if rag_dedup_saved > 0 {
            debug!(
                tokens_saved = rag_dedup_saved,
                "Cross-turn RAG dedup applied"
            );
        }

        let tool_result_saved = crate::tools::result_truncator::compress_old_tool_results(
            &mut conversation.messages,
            self.config.tool_result_max_tokens,
        );
        if tool_result_saved > 0 {
            debug!(
                tokens_saved = tool_result_saved,
                "Old tool results compressed"
            );
        }

        // Re-estimate after lightweight passes; update allocation pressure.
        // This shadows the initial allocation, which was only used for the plan.
        let estimate_after_lightweight = self.estimate_conversation(conversation);
        let allocation =
            self.budget
                .allocate_adaptive(&estimate_after_lightweight, false, false, 0);

        // Compact history if still needed after lightweight passes
        let compaction = if allocation.requires_compaction {
            debug!(
                history_budget = allocation.history,
                history_actual =
                    estimate_after_lightweight.history + estimate_after_lightweight.summary,
                pressure = allocation.pressure,
                "History compaction triggered"
            );
            match self
                .compactor
                .compact(conversation, &allocation, inference)
                .await
            {
                Ok(result) => {
                    info!(
                        strategy = ?result.strategy,
                        messages_removed = result.messages_removed,
                        tokens_saved = result.tokens_saved,
                        "History compacted"
                    );
                    Some(result)
                },
                // Best-effort: a failed compaction does not fail the whole call.
                Err(e) => {
                    warn!(error = %e, "History compaction failed, continuing without");
                    None
                },
            }
        } else {
            None
        };

        // Optimize system prompt if present and over budget
        let system_prompt_trimmed = if let Some(ref prompt) = conversation.system_prompt {
            // Use cached token count if available (standalone mode only)
            #[cfg(not(feature = "pisovereign"))]
            let prompt_tokens = conversation
                .cached_prompt_tokens
                .unwrap_or_else(|| self.estimate_tokens(prompt));
            #[cfg(feature = "pisovereign")]
            let prompt_tokens = self.estimate_tokens(prompt);

            if prompt_tokens > allocation.system_prompt {
                // Second argument flags whether the prompt text mentions a
                // "Context:" / "context:" section.
                let context = PromptContext::new(
                    false,
                    prompt.contains("Context:") || prompt.contains("context:"),
                );
                let optimized = optimize_system_prompt(prompt, allocation.system_prompt, &context);

                // Cache the optimized prompt token count (standalone mode)
                #[cfg(not(feature = "pisovereign"))]
                {
                    conversation.cached_prompt_tokens = Some(self.estimate_tokens(&optimized));
                }

                conversation.system_prompt = Some(optimized);
                true
            } else {
                // Cache the un-trimmed count for next turn (standalone mode)
                #[cfg(not(feature = "pisovereign"))]
                if conversation.cached_prompt_tokens.is_none() {
                    conversation.cached_prompt_tokens = Some(prompt_tokens);
                }

                false
            }
        } else {
            false
        };

        // Inject conciseness directives under pressure. This runs only when the
        // system prompt was trimmed above (compaction is not part of the
        // condition), to avoid a net token increase.
        // NOTE(review): when the injected prompt replaces the trimmed one,
        // `cached_prompt_tokens` still holds the count of the trimmed prompt —
        // confirm whether the cache should be refreshed here.
        if system_prompt_trimmed {
            if let Some(ref prompt) = conversation.system_prompt {
                let concise = inject_conciseness(
                    prompt,
                    f64::from(allocation.pressure),
                    self.config.conciseness_pressure_threshold,
                );
                // A length change is used as the signal that something was injected.
                if concise.len() != prompt.len() {
                    conversation.system_prompt = Some(concise);
                }
            }
        }

        let estimate_after = self.estimate_conversation(conversation);

        // Skip the summary log for an empty conversation (avoids dividing by zero).
        if estimate_before.total > 0 {
            let saved = estimate_before.total.saturating_sub(estimate_after.total);
            let reduction_pct = (f64::from(saved) / f64::from(estimate_before.total)) * 100.0;
            info!(
                before = estimate_before.total,
                after = estimate_after.total,
                saved,
                reduction_pct = format_args!("{reduction_pct:.1}%"),
                "Conversation optimized"
            );
        }

        // Output budget: let the model generate without artificial limits.
        // Only `output_max_tokens` (if explicitly configured) acts as a hard cap.
        let recommended_max_tokens: Option<u32> = self.config.output_max_tokens;

        // Record metrics
        if let Some(ref metrics) = self.metrics {
            // Map the compaction strategy to its metrics label; "none" also
            // covers the case where no compaction ran.
            let strategy_name = compaction.as_ref().map_or("none", |c| match c.strategy {
                crate::history::compactor::CompactionStrategy::None => "none",
                crate::history::compactor::CompactionStrategy::Deduplication => "dedup",
                crate::history::compactor::CompactionStrategy::Lossless => "lossless",
                crate::history::compactor::CompactionStrategy::Extractive => "extractive",
                crate::history::compactor::CompactionStrategy::Paraphrasing => "paraphrasing",
                crate::history::compactor::CompactionStrategy::LlmFallback => "llm",
            });
            metrics.record_optimization(estimate_before.total, estimate_after.total, strategy_name);
            if recommended_max_tokens.is_some() {
                metrics.record_output_cap();
            }
        }

        Ok(OptimizationResult {
            budget: allocation,
            estimate_before,
            estimate_after,
            compaction,
            system_prompt_trimmed,
            recommended_max_tokens,
            plan,
        })
    }

    /// Optimize a conversation that will include tool definitions.
    ///
    /// Same pipeline as `optimize_conversation`, but the budget allocation is
    /// computed with tools flagged as present and `tools.len()` passed as the
    /// tool count, and the system prompt is optimized with a tool-aware
    /// `PromptContext`. The tool definitions themselves are not modified.
    ///
    /// The conversation is modified in place. A compaction failure is logged
    /// at WARN and otherwise ignored. `inference` is forwarded to the history
    /// compactor.
    ///
    /// # Errors
    ///
    /// No path in this body currently returns `Err`.
    #[allow(clippy::too_many_lines)]
    #[instrument(skip(self, conversation, tools, inference), fields(
        msgs = conversation.messages.len(),
        tools = tools.len(),
    ))]
    pub async fn optimize_conversation_with_tools(
        &self,
        conversation: &mut Conversation,
        tools: &[ToolDefinition],
        inference: Option<&dyn SummarizationPort>,
    ) -> Result<OptimizationResult, TokenOptError> {
        let estimate_before = self.estimate_conversation(conversation);

        // Compute initial budget allocation (tools present)
        let allocation = self
            .budget
            .allocate_adaptive(&estimate_before, false, true, tools.len());

        // Build impact-ordered plan BEFORE applying any optimization
        let plan = OptimizationPlan::build(
            conversation,
            &allocation,
            self.config.tool_result_max_tokens,
        );

        // Apply the lightweight optimizations in a fixed order; the savings
        // they report are discarded in this path.
        // Turn index approximated as two messages per turn.
        let current_turn = conversation.messages.len() / 2;
        crate::prompt::rag_cross_turn_dedup::deduplicate_rag_across_turns(
            &mut conversation.messages,
            current_turn,
            None,
        );

        crate::tools::result_truncator::compress_old_tool_results(
            &mut conversation.messages,
            self.config.tool_result_max_tokens,
        );

        // Re-estimate after lightweight passes; update allocation pressure.
        // This shadows the initial allocation, which was only used for the plan.
        let estimate_after_lightweight = self.estimate_conversation(conversation);
        let allocation =
            self.budget
                .allocate_adaptive(&estimate_after_lightweight, false, true, tools.len());

        // Compact history if still needed after lightweight passes
        let compaction = if allocation.requires_compaction {
            match self
                .compactor
                .compact(conversation, &allocation, inference)
                .await
            {
                Ok(result) => {
                    info!(
                        strategy = ?result.strategy,
                        messages_removed = result.messages_removed,
                        tokens_saved = result.tokens_saved,
                        "History compacted (tool-calling context)"
                    );
                    Some(result)
                },
                // Best-effort: a failed compaction does not fail the whole call.
                Err(e) => {
                    warn!(error = %e, "History compaction failed, continuing without");
                    None
                },
            }
        } else {
            None
        };

        // Optimize system prompt with tool context
        let system_prompt_trimmed = if let Some(ref prompt) = conversation.system_prompt {
            // Use cached token count if available (standalone mode only)
            #[cfg(not(feature = "pisovereign"))]
            let prompt_tokens = conversation
                .cached_prompt_tokens
                .unwrap_or_else(|| self.estimate_tokens(prompt));
            #[cfg(feature = "pisovereign")]
            let prompt_tokens = self.estimate_tokens(prompt);

            if prompt_tokens > allocation.system_prompt {
                // First argument marks the tool-calling context; the second
                // flags whether the prompt text mentions a "Context:" section.
                let context = PromptContext::new(
                    true,
                    prompt.contains("Context:") || prompt.contains("context:"),
                );
                let optimized = optimize_system_prompt(prompt, allocation.system_prompt, &context);

                // Cache the optimized prompt token count (standalone mode)
                #[cfg(not(feature = "pisovereign"))]
                {
                    conversation.cached_prompt_tokens = Some(self.estimate_tokens(&optimized));
                }

                conversation.system_prompt = Some(optimized);
                true
            } else {
                // Cache the un-trimmed count for next turn (standalone mode)
                #[cfg(not(feature = "pisovereign"))]
                if conversation.cached_prompt_tokens.is_none() {
                    conversation.cached_prompt_tokens = Some(prompt_tokens);
                }

                false
            }
        } else {
            false
        };

        // Inject conciseness directives under pressure; runs only when the
        // system prompt was trimmed above.
        // NOTE(review): when the injected prompt replaces the trimmed one,
        // `cached_prompt_tokens` still holds the count of the trimmed prompt —
        // confirm whether the cache should be refreshed here.
        if system_prompt_trimmed {
            if let Some(ref prompt) = conversation.system_prompt {
                let concise = inject_conciseness(
                    prompt,
                    f64::from(allocation.pressure),
                    self.config.conciseness_pressure_threshold,
                );
                // A length change is used as the signal that something was injected.
                if concise.len() != prompt.len() {
                    conversation.system_prompt = Some(concise);
                }
            }
        }

        let estimate_after = self.estimate_conversation(conversation);

        // Output budget: let the model generate without artificial limits.
        // Only `output_max_tokens` (if explicitly configured) acts as a hard cap.
        let recommended_max_tokens: Option<u32> = self.config.output_max_tokens;

        // Record metrics.
        // NOTE(review): unlike `optimize_conversation`, the strategy label is
        // the fixed string "tools" regardless of the compaction outcome, and
        // `record_output_cap` is not called — confirm this is intended.
        if let Some(ref metrics) = self.metrics {
            metrics.record_optimization(estimate_before.total, estimate_after.total, "tools");
        }

        Ok(OptimizationResult {
            budget: allocation,
            estimate_before,
            estimate_after,
            compaction,
            system_prompt_trimmed,
            recommended_max_tokens,
            plan,
        })
    }

    /// Select and compress tool definitions for a query.
    ///
    /// At most `config.max_tools_per_request` tools are selected for `query`,
    /// then the schemas of the selected tools are compressed.
    ///
    /// Returns the optimized set of tools ready to send to the LLM; an empty
    /// input yields an empty vector without running selection.
    #[must_use]
    pub fn optimize_tools(&self, query: &str, tools: &[ToolDefinition]) -> Vec<ToolDefinition> {
        if tools.is_empty() {
            return Vec::new();
        }

        // Select relevant tools, then compress the schemas of the survivors.
        let selected = select_tools(query, tools, self.config.max_tools_per_request);
        let compressed = compress_tool_definitions(&selected);

        // The token estimates exist only for this diagnostic event. Written as
        // field expressions they are evaluated only when the event is enabled,
        // so a normal run no longer pays for three estimation passes over the
        // tool definitions.
        debug!(
            before_count = tools.len(),
            after_count = selected.len(),
            before_tokens = self.estimate_tool_definitions(tools),
            after_tokens = self.estimate_tool_definitions(&selected),
            compressed_tokens = self.estimate_tool_definitions(&compressed),
            "Tools optimized"
        );

        compressed
    }

    /// Select, compress and then progressively compress tools.
    ///
    /// Runs [`optimize_tools`](Self::optimize_tools) first and hands its
    /// output, together with `tracker`, to the progressive compressor. The
    /// extra pass strips descriptions from tools already seen in previous
    /// turns, keeping only name + parameter types.
    #[must_use]
    pub fn optimize_tools_progressive(
        &self,
        query: &str,
        tools: &[ToolDefinition],
        tracker: &crate::tools::progressive::ToolUsageTracker,
    ) -> Vec<ToolDefinition> {
        use crate::tools::progressive::compress_progressively;

        compress_progressively(&self.optimize_tools(query, tools), tracker)
    }

    /// Simple text-in / text-out prompt optimization.
    ///
    /// Wraps the input text as a single-user-message conversation, runs
    /// [`optimize_conversation`](Self::optimize_conversation) on it, and
    /// returns an [`OptimizedPrompt`](crate::types::OptimizedPrompt) built
    /// from the first message left in that conversation.
    ///
    /// The returned metadata carries the token estimates before and after,
    /// the reduction ratio (`0.0` when the input estimates to zero tokens or
    /// the text did not shrink) and the query complexity classified from the
    /// original `text`. `strategies_applied` lists only system prompt
    /// trimming and history compaction.
    ///
    /// # Errors
    ///
    /// Returns [`TokenOptError`] if optimization fails.
    pub async fn optimize_prompt(
        &self,
        text: &str,
        inference: Option<&dyn SummarizationPort>,
    ) -> Result<crate::types::OptimizedPrompt, TokenOptError> {
        let tokens_before = self.estimate_tokens(text);

        let mut conv = Conversation::new();
        conv.add_user_message(text);

        let result = self.optimize_conversation(&mut conv, inference).await?;

        // `conv` is a throwaway wrapper, so move the content out instead of
        // cloning it. An empty conversation yields an empty string.
        let optimized_text = conv
            .messages
            .first_mut()
            .map(|m| std::mem::take(&mut m.content))
            .unwrap_or_default();
        let tokens_after = self.estimate_tokens(&optimized_text);

        let complexity = crate::output::complexity::classify_query(text);

        // Saturate so that a result estimated larger than the input reports a
        // ratio of 0.0 rather than underflowing the unsigned subtraction.
        #[allow(clippy::cast_precision_loss)]
        let reduction_ratio = if tokens_before > 0 {
            tokens_before.saturating_sub(tokens_after) as f32 / tokens_before as f32
        } else {
            0.0
        };

        let mut strategies = Vec::new();
        if result.system_prompt_trimmed {
            strategies.push("system_prompt_trim".to_string());
        }
        if result.compaction.is_some() {
            strategies.push("history_compaction".to_string());
        }

        Ok(crate::types::OptimizedPrompt {
            text: optimized_text,
            recommended_max_tokens: self.config.output_max_tokens,
            tokens_estimated: tokens_after,
            strategies_applied: strategies,
            metadata: crate::types::OptimizationMetadata {
                tokens_before,
                tokens_after,
                reduction_ratio,
                complexity: Some(complexity),
            },
        })
    }

    /// Build a repetition detector for monitoring output streams.
    ///
    /// Returns `None` when `config.repetition_detection_enabled` is false;
    /// otherwise the detector is created from the configured n-gram size
    /// and threshold.
    #[must_use]
    pub fn create_stream_monitor(&self) -> Option<RepetitionDetector> {
        let cfg = &self.config;
        cfg.repetition_detection_enabled.then(|| {
            RepetitionDetector::new(cfg.repetition_ngram_size, cfg.repetition_threshold)
        })
    }
}

/// Best-effort construction of the HuggingFace tokenizer named in the config.
///
/// `config.tokenizer_model` is first treated as a path: if something exists
/// at that path, it is loaded as a tokenizer file. When the path does not
/// exist or loading fails, the same string is passed to `from_pretrained`
/// as a model identifier.
///
/// Returns `None` when no model is configured or when every attempt fails;
/// failures are logged at WARN and the optimizer then falls back to the
/// heuristic estimator.
#[cfg(feature = "hf-tokenizer")]
fn init_hf_estimator(
    config: &TokenOptimizationConfig,
) -> Option<std::sync::Arc<crate::estimator_hf::HfTokenEstimator>> {
    use crate::estimator_hf::HfTokenEstimator;
    use std::sync::Arc;

    let model = config.tokenizer_model.as_deref()?;

    // Attempt 1: a tokenizer file on the local filesystem.
    let local_path = std::path::Path::new(model);
    if local_path.exists() {
        match HfTokenEstimator::from_file(local_path) {
            Err(e) => warn!(error = %e, path = model, "Failed to load local tokenizer"),
            Ok(est) => {
                info!(path = model, "Using local HuggingFace tokenizer");
                return Some(Arc::new(est));
            },
        }
    }

    // Attempt 2: resolve the string as a HuggingFace Hub model identifier.
    let est = match HfTokenEstimator::from_pretrained(model) {
        Ok(est) => est,
        Err(e) => {
            warn!(
                error = %e,
                model,
                "Failed to initialize HuggingFace tokenizer, using heuristic estimation"
            );
            return None;
        },
    };
    info!(model, "Initialized HuggingFace tokenizer from Hub");
    Some(Arc::new(est))
}

#[cfg(test)]
mod tests {
    use super::*;

    fn default_config() -> TokenOptimizationConfig {
        TokenOptimizationConfig::default()
    }

    /// An optimizer built from the default configuration reports itself enabled.
    #[test]
    fn optimizer_creates_with_defaults() {
        assert!(TokenOptimizer::new(default_config()).is_enabled());
    }

    /// Clearing `enabled` in the configuration is reflected by `is_enabled`.
    #[test]
    fn disabled_optimizer_reports_disabled() {
        let mut cfg = default_config();
        cfg.enabled = false;

        let is_enabled = TokenOptimizer::new(cfg).is_enabled();
        assert!(!is_enabled);
    }

    /// A short two-message conversation must pass through the optimizer with
    /// neither history compaction nor system prompt trimming applied.
    #[tokio::test]
    async fn optimize_small_conversation_no_compaction() {
        let optimizer = TokenOptimizer::new(default_config());
        let mut conv = Conversation::new();
        conv.add_user_message("Hello");
        conv.add_assistant_message("Hi there!");

        // `expect` alone is sufficient: a preceding `assert!(result.is_ok())`
        // would panic first and hide the actual error value from the report.
        let result = optimizer
            .optimize_conversation(&mut conv, None)
            .await
            .expect("optimization should succeed");

        assert!(result.compaction.is_none());
        assert!(!result.system_prompt_trimmed);
    }

    /// Sixty verbose user/assistant exchanges must trigger history compaction,
    /// and the post-optimization token estimate must be lower than the
    /// pre-optimization one.
    #[tokio::test]
    async fn optimize_large_conversation_triggers_compaction() {
        let optimizer = TokenOptimizer::new(default_config());
        let mut conv = Conversation::with_system_prompt("You are a helpful assistant.");

        // Add many messages to exceed the compaction threshold
        for i in 0..60 {
            conv.add_user_message(format!(
                "Message number {i} with a significant amount of content designed to take \
                 up token space and push us over the budget threshold. This message includes \
                 additional context about the topic, references to previous conversations, \
                 and detailed questions that require substantial processing. Furthermore, \
                 each message contributes to the overall token count which should eventually \
                 exceed the configured compaction trigger ratio of the context window."
            ));
            conv.add_assistant_message(format!(
                "Response number {i} with a comprehensive and detailed answer about the \
                 topic including various facts, figures, explanations, and recommendations. \
                 This response is intentionally verbose to ensure that the conversation \
                 history grows large enough to trigger the compaction mechanism. It covers \
                 multiple aspects of the question and provides thorough analysis with \
                 supporting evidence, examples, and actionable next steps for the user."
            ));
        }

        // `expect` alone is sufficient: a preceding `assert!(result.is_ok())`
        // would panic first and hide the actual error value from the report.
        let result = optimizer
            .optimize_conversation(&mut conv, None)
            .await
            .expect("optimization should succeed");

        assert!(result.compaction.is_some());
        assert!(result.estimate_after.total < result.estimate_before.total);
    }

    /// Offering fifteen tools must yield a selection of at most eight.
    #[test]
    fn optimize_tools_reduces_count() {
        let mut tools: Vec<ToolDefinition> = Vec::with_capacity(15);
        for idx in 0..15 {
            tools.push(ToolDefinition {
                name: format!("tool_{idx}"),
                description: format!("This tool does something related to task {idx}"),
                parameters: crate::types::ToolParameters {
                    schema_type: String::from("object"),
                    properties: std::collections::HashMap::new(),
                    required: Vec::new(),
                },
                icon: None,
            });
        }

        let optimizer = TokenOptimizer::new(default_config());
        let selected = optimizer.optimize_tools("I need tool_1 for my task", &tools);

        assert!(selected.len() <= 8); // max_tools_per_request default
    }

    /// With the default configuration a stream monitor is produced.
    #[test]
    fn create_stream_monitor_when_enabled() {
        let monitor = TokenOptimizer::new(default_config()).create_stream_monitor();
        assert!(monitor.is_some());
    }

    /// Turning off `repetition_detection_enabled` suppresses the stream monitor.
    #[test]
    fn no_stream_monitor_when_disabled() {
        let mut cfg = default_config();
        cfg.repetition_detection_enabled = false;

        let monitor = TokenOptimizer::new(cfg).create_stream_monitor();
        assert!(monitor.is_none());
    }

    /// An empty tool list stays empty after optimization.
    #[test]
    fn optimize_empty_tools_returns_empty() {
        let selected = TokenOptimizer::new(default_config()).optimize_tools("query", &[]);
        assert!(selected.is_empty());
    }

    /// Under the default configuration the requested value is returned unchanged.
    #[test]
    fn cap_output_tokens_no_cap() {
        let capped = TokenOptimizer::new(default_config()).cap_output_tokens(2048);
        assert_eq!(capped, 2048);
    }

    /// A request below the configured `output_max_tokens` is left as-is.
    #[test]
    fn cap_output_tokens_within_cap() {
        let mut cfg = default_config();
        cfg.output_max_tokens = Some(1024);

        let capped = TokenOptimizer::new(cfg).cap_output_tokens(512);
        assert_eq!(capped, 512);
    }

    /// A request above the configured `output_max_tokens` is clamped to it.
    #[test]
    fn cap_output_tokens_exceeds_cap() {
        let mut cfg = default_config();
        cfg.output_max_tokens = Some(1024);

        let capped = TokenOptimizer::new(cfg).cap_output_tokens(2048);
        assert_eq!(capped, 1024);
    }

    /// With the default configuration the optimization result carries no
    /// recommended output token limit.
    #[tokio::test]
    async fn optimize_conversation_returns_no_max_tokens_by_default() {
        let mut conv = Conversation::new();
        conv.add_user_message("What is the weather today?");
        conv.add_assistant_message("It's sunny.");
        conv.add_user_message("Thanks!");

        let optimizer = TokenOptimizer::new(default_config());
        let outcome = optimizer.optimize_conversation(&mut conv, None).await;
        let outcome = outcome.expect("optimization should succeed");

        // No output budget when output_max_tokens is unset
        assert!(outcome.recommended_max_tokens.is_none());
    }

    /// A configured `output_max_tokens` is surfaced as the recommended limit
    /// in the optimization result.
    #[tokio::test]
    async fn optimize_conversation_caps_output_tokens() {
        let mut cfg = default_config();
        cfg.output_max_tokens = Some(64);
        let optimizer = TokenOptimizer::new(cfg);

        let mut conv = Conversation::new();
        conv.add_user_message("Explain quantum mechanics in detail please.");

        let outcome = optimizer.optimize_conversation(&mut conv, None).await;
        let outcome = outcome.expect("optimization should succeed");

        assert_eq!(outcome.recommended_max_tokens, Some(64));
    }

    /// Planning against an empty conversation yields no steps and zero total
    /// estimated savings.
    #[test]
    fn optimization_plan_empty_for_clean_conversation() {
        let allocation = BudgetAllocation {
            system_prompt: 512,
            rag_context: 1024,
            tool_definitions: 512,
            history: 4096,
            response_headroom: 1024,
            requires_compaction: false,
            pressure: 0.1,
        };
        let empty_conv = Conversation::new();

        let plan = OptimizationPlan::build(&empty_conv, &allocation, 100);

        assert!(
            plan.steps.is_empty(),
            "empty conversation should produce no optimization steps"
        );
        assert_eq!(plan.total_estimated_savings(), 0);
    }

    /// `total_estimated_savings` adds up the savings of every step in the plan.
    #[test]
    fn optimization_plan_total_estimated_savings_sums_steps() {
        let first = OptimizationStep {
            name: "step_a",
            estimated_savings: 200,
        };
        let second = OptimizationStep {
            name: "step_b",
            estimated_savings: 300,
        };
        let plan = OptimizationPlan {
            steps: vec![first, second],
        };

        assert_eq!(plan.total_estimated_savings(), 500);
    }

    /// Build a tool-role [`crate::types::ChatMessage`] regardless of build flavor.
    ///
    /// Mirrors the helper in the `result_truncator` tests: standalone builds
    /// call `ChatMessage::tool(content)`, while `pisovereign` builds also need
    /// a tool call id, for which a fixed placeholder is supplied.
    #[cfg(not(feature = "pisovereign"))]
    fn make_tool_msg(content: impl Into<String>) -> crate::types::ChatMessage {
        let text: String = content.into();
        crate::types::ChatMessage::tool(text)
    }

    /// `pisovereign` variant of the helper above; uses `"tool_call_id"` as the
    /// placeholder call id.
    #[cfg(feature = "pisovereign")]
    fn make_tool_msg(content: impl Into<String>) -> crate::types::ChatMessage {
        let text: String = content.into();
        crate::types::ChatMessage::tool("tool_call_id", text)
    }

    /// Steps returned by `OptimizationPlan::build` must be ordered from the
    /// largest to the smallest `estimated_savings`.
    ///
    /// NOTE(review): the ordering check also passes for a plan with zero or
    /// one step; confirm that this fixture really yields several steps.
    #[test]
    fn optimization_plan_sorted_descending_by_savings() {
        // A system prompt longer than its budget → triggers `system_prompt_trim`
        let oversized_prompt = "A".repeat(4096);
        let mut conv = Conversation::with_system_prompt(oversized_prompt);

        // Add tool messages before the last user message so tool compression is estimated.
        conv.messages
            .extend((0..5).map(|_| make_tool_msg("T".repeat(2048))));

        // Final user message (tool messages are before this)
        conv.add_user_message("query");

        let allocation = BudgetAllocation {
            // System prompt budget much smaller than actual prompt → large savings estimate
            system_prompt: 10,
            rag_context: 1024,
            tool_definitions: 512,
            history: 4096,
            response_headroom: 1024,
            requires_compaction: true,
            pressure: 0.95,
        };
        let plan = OptimizationPlan::build(&conv, &allocation, 10);

        // Steps must be in descending order of estimated_savings
        let savings: Vec<u32> = plan
            .steps
            .iter()
            .map(|step| step.estimated_savings)
            .collect();
        let mut sorted = savings.clone();
        sorted.sort_unstable();
        sorted.reverse();

        assert_eq!(
            savings, sorted,
            "plan steps must be sorted descending by savings"
        );
    }

    /// Smoke test: the optimization result exposes a `plan` whose total
    /// estimated savings can be queried. No particular value is asserted.
    #[tokio::test]
    async fn optimize_conversation_result_includes_plan() {
        let mut conv = Conversation::new();
        conv.add_user_message("What is 2+2?");
        conv.add_assistant_message("4");
        conv.add_user_message("Thanks");

        let optimizer = TokenOptimizer::new(default_config());
        let outcome = optimizer.optimize_conversation(&mut conv, None).await;
        let outcome = outcome.expect("optimization should succeed");

        // Plan is always present, even if empty
        let _total_savings = outcome.plan.total_estimated_savings(); // just verify field is accessible
    }
}