ai_tokenopt 0.5.10

//! Conversation history compactor
//!
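//! Reduces conversation token usage through a three-tier adaptive strategy,
//! preceded by cheap structural passes (adjacent-message deduplication and
//! tool-chain collapsing):
//! 1. **Lossless**: whitespace cleanup that leaves the message wording untouched
//! 2. **Extractive**: relevance-based pruning plus heuristic sentence
//!    extraction from the pruned messages
//! 3. **LLM fallback**: paraphrasing and summarization through the
//!    `SummarizationPort` (e.g. an Ollama backend) under extreme pressure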

use crate::ports::SummarizationPort;
use crate::types::{ChatMessage, Conversation, MessageRole};

use crate::budget::BudgetAllocation;
use crate::error::TokenOptError;
use crate::estimator::TokenEstimator;
use crate::history::summarizer::ExtractiveSummarizer;

/// Strategy used during compaction.
///
/// Stored in `CompactionResult::strategy` so the caller can tell which phase
/// of `HistoryCompactor::compact` produced the returned result.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompactionStrategy {
    /// No compaction needed — within budget
    None,
    /// Removed near-duplicate adjacent messages
    Deduplication,
    /// Removed redundant whitespace only
    ///
    /// Also reported when collapsing tool call/result chains was enough to
    /// bring the history within budget.
    Lossless,
    /// Extracted key sentences from pruned messages
    ///
    /// Also reported as the final outcome when the LLM phases are skipped,
    /// have no inference port, or fail.
    Extractive,
    /// LLM-paraphrased old verbose assistant messages
    Paraphrasing,
    /// Used the LLM to generate a summary of pruned messages
    LlmFallback,
}

/// Result of a compaction operation.
///
/// Returned by `HistoryCompactor::compact` to describe what was done to the
/// conversation.
#[derive(Debug, Clone)]
pub struct CompactionResult {
    /// Number of messages removed from history
    pub messages_removed: usize,
    /// Estimated tokens saved, derived from `TokenEstimator` estimates taken
    /// before and after compaction
    pub tokens_saved: u32,
    /// Whether a summary was generated
    pub summary_generated: bool,
    /// Which compaction strategy was applied
    pub strategy: CompactionStrategy,
}

/// History compactor that progressively reduces token usage.
///
/// Construct it with `new` (all phases allowed) or `new_fast` (LLM phases
/// skipped) and call `compact` on a conversation.
#[derive(Debug)]
pub struct HistoryCompactor {
    /// Token budget for the conversation's rolling summary. It is handed to
    /// the extractive summarizer and the summary compactor, and used as the
    /// final truncation limit for the stored summary.
    max_summary_tokens: u32,
    /// When `true`, skip LLM-based compaction phases (paraphrasing and
    /// LLM fallback summarization) to avoid blocking the critical inference
    /// path. Heuristic phases (dedup, lossless, extractive) still run.
    skip_llm_phases: bool,
}

impl HistoryCompactor {
    /// Build a compactor that is allowed to use every compaction phase,
    /// including the LLM-backed ones.
    ///
    /// `max_summary_tokens` is the token budget for the rolling summary.
    #[must_use]
    pub const fn new(max_summary_tokens: u32) -> Self {
        Self {
            skip_llm_phases: false,
            max_summary_tokens,
        }
    }

    /// Create a compactor that skips expensive LLM phases.
    ///
    /// Use this on the critical chat path where blocking for 1-5s on
    /// LLM summarization is unacceptable. Heuristic compaction is still
    /// applied; LLM refinement can run in a background job later.
    #[must_use]
    pub const fn new_fast(max_summary_tokens: u32) -> Self {
        Self {
            max_summary_tokens,
            skip_llm_phases: true,
        }
    }

    /// Compact conversation history to fit within the token budget.
    ///
    /// The compaction is applied in-place to the conversation. Phases run
    /// from cheapest to most expensive, and the method returns as soon as the
    /// estimated size of the messages fits `budget.history`:
    ///
    /// 0. Deduplicate near-identical adjacent messages, then collapse
    ///    sequential tool call/result chains.
    /// 1. Collapse runs of whitespace in every message.
    /// 2. Prune the least relevant older messages (relevance is scored
    ///    against the last user message) and merge an extractive summary of
    ///    the pruned messages into the conversation's `summary` field.
    /// 3. Unless LLM phases are skipped: paraphrase old messages through the
    ///    inference port, then ask it for a summary of the pruned messages.
    ///
    /// A compactor built in fast mode never hands the inference port to any
    /// phase, so no LLM call can happen on that path.
    ///
    /// # Arguments
    ///
    /// * `conversation` — Mutable reference to the conversation to compact
    /// * `budget` — The token budget allocation computed by `TokenBudget`
    /// * `inference` — Optional inference port for LLM-based summarization
    ///
    /// # Returns
    ///
    /// A [`CompactionResult`] whose `messages_removed` is cumulative over all
    /// phases that ran, whose `tokens_saved` is measured against the estimate
    /// taken before the first phase, and whose `summary_generated` is `true`
    /// only when this call stored a new summary.
    ///
    /// # Errors
    ///
    /// The current implementation has no failing path: an error from the LLM
    /// summarization is swallowed and the extractive result is returned
    /// instead.
    #[allow(clippy::too_many_lines)]
    pub async fn compact(
        &self,
        conversation: &mut Conversation,
        budget: &BudgetAllocation,
        inference: Option<&dyn SummarizationPort>,
    ) -> Result<CompactionResult, TokenOptError> {
        // Fast path: already within budget, so even the estimate is skipped.
        if !budget.requires_compaction {
            return Ok(CompactionResult {
                messages_removed: 0,
                tokens_saved: 0,
                summary_generated: false,
                strategy: CompactionStrategy::None,
            });
        }

        let initial_tokens = TokenEstimator::estimate_messages(&conversation.messages);

        // In fast mode the port is withheld from *every* phase; otherwise the
        // summary compactor in the extractive phase could still block on it.
        let llm_port = if self.skip_llm_phases {
            None
        } else {
            inference
        };

        // Running totals so that every exit reports the work of all phases
        // that ran, consistent with `tokens_saved` being measured from the start.
        let mut messages_removed = 0usize;
        let mut summary_generated = false;

        // Phase 0: Deduplicate near-identical adjacent messages
        {
            let dedup_result =
                crate::history::dedup::deduplicate_adjacent(&conversation.messages, 0.7);
            if dedup_result.merged_count > 0 {
                messages_removed += dedup_result.merged_count;
                conversation.messages = dedup_result.messages;
                let after_dedup = TokenEstimator::estimate_messages(&conversation.messages);
                if after_dedup <= budget.history {
                    return Ok(CompactionResult {
                        messages_removed,
                        tokens_saved: initial_tokens.saturating_sub(after_dedup),
                        summary_generated,
                        strategy: CompactionStrategy::Deduplication,
                    });
                }
            }
        }

        // Phase 0b: Collapse sequential tool call/result chains
        {
            let collapse_result =
                crate::tools::chain_collapser::collapse_tool_chains(&conversation.messages);
            if collapse_result.collapsed_count > 0 {
                messages_removed += collapse_result.collapsed_count;
                conversation.messages = collapse_result.messages;
                let after_collapse = TokenEstimator::estimate_messages(&conversation.messages);
                if after_collapse <= budget.history {
                    return Ok(CompactionResult {
                        messages_removed,
                        tokens_saved: initial_tokens.saturating_sub(after_collapse),
                        summary_generated,
                        strategy: CompactionStrategy::Lossless,
                    });
                }
            }
        }

        // Phase 1: Lossless — trim whitespace from all messages
        for msg in &mut conversation.messages {
            let trimmed = collapse_whitespace(&msg.content);
            if trimmed.len() < msg.content.len() {
                msg.content = trimmed;
            }
        }

        let after_lossless = TokenEstimator::estimate_messages(&conversation.messages);
        if after_lossless <= budget.history {
            return Ok(CompactionResult {
                messages_removed,
                tokens_saved: initial_tokens.saturating_sub(after_lossless),
                summary_generated,
                strategy: CompactionStrategy::Lossless,
            });
        }

        // Phase 2: Extractive — prune least-relevant non-system messages
        // and generate a summary from them.
        //
        // Extract the last user message as the relevance query so that
        // messages semantically related to the current question survive.
        let query: String = conversation
            .messages
            .iter()
            .rev()
            .find(|m| m.role == MessageRole::User)
            .map_or_else(String::new, |m| m.content.clone());
        let (pruned_messages, pruned_count) =
            prune_by_relevance(conversation, budget.history, &query);
        messages_removed += pruned_count;

        if !pruned_messages.is_empty() {
            let summary =
                ExtractiveSummarizer::summarize(&pruned_messages, self.max_summary_tokens);
            if !summary.is_empty() {
                self.merge_into_summary(conversation, summary, llm_port)
                    .await;
                summary_generated = true;
            }
        }

        let after_extractive = TokenEstimator::estimate_messages(&conversation.messages);
        if after_extractive <= budget.history {
            return Ok(CompactionResult {
                messages_removed,
                tokens_saved: initial_tokens.saturating_sub(after_extractive),
                summary_generated,
                strategy: CompactionStrategy::Extractive,
            });
        }

        // Phase 2b: LLM paraphrasing of old verbose assistant messages.
        // `llm_port` is `None` in fast mode, so this never blocks the
        // inference critical path there.
        if let Some(port) = llm_port {
            let pressure = f64::from(after_extractive) / f64::from(budget.history).max(1.0);
            let current_turn = conversation.messages.len() / 2;
            let (para_count, para_saved) = crate::history::paraphraser::paraphrase_old_messages(
                &mut conversation.messages,
                pressure,
                current_turn,
                port,
            )
            .await;
            if para_count > 0 {
                let after_para = TokenEstimator::estimate_messages(&conversation.messages);
                if after_para <= budget.history {
                    return Ok(CompactionResult {
                        messages_removed,
                        tokens_saved: initial_tokens.saturating_sub(after_para).max(para_saved),
                        summary_generated,
                        strategy: CompactionStrategy::Paraphrasing,
                    });
                }
            }
        }

        // Phase 3: LLM fallback — use the inference port to generate a better
        // summary. Only attempted when something was actually pruned: an
        // empty excerpt would cost an LLM call and store a summary of nothing.
        if let Some(port) = llm_port.filter(|_| !pruned_messages.is_empty()) {
            if let Ok(result) = self.llm_summarize(port, &pruned_messages).await {
                self.merge_into_summary(conversation, result, Some(port))
                    .await;

                let after_llm = TokenEstimator::estimate_messages(&conversation.messages);
                return Ok(CompactionResult {
                    messages_removed,
                    tokens_saved: initial_tokens.saturating_sub(after_llm),
                    summary_generated: true,
                    strategy: CompactionStrategy::LlmFallback,
                });
            }
            // LLM failed — fall through with extractive result
        }

        let final_tokens = TokenEstimator::estimate_messages(&conversation.messages);
        Ok(CompactionResult {
            messages_removed,
            tokens_saved: initial_tokens.saturating_sub(final_tokens),
            summary_generated,
            strategy: CompactionStrategy::Extractive,
        })
    }

    /// Append `fresh` to the conversation's rolling summary (separated by
    /// `" | "`), compact the merged text, and store it truncated to the
    /// summary token budget. `port` is forwarded to the summary compactor.
    async fn merge_into_summary(
        &self,
        conversation: &mut Conversation,
        fresh: String,
        port: Option<&dyn SummarizationPort>,
    ) {
        let merged = match &conversation.summary {
            Some(existing) => format!("{existing} | {fresh}"),
            None => fresh,
        };
        // Compact merged summary (dedup segments, optionally LLM re-summarize)
        let compacted = crate::history::summary_compactor::compact_summary(
            &merged,
            self.max_summary_tokens,
            port,
        )
        .await;
        conversation.summary = Some(truncate_summary(&compacted, self.max_summary_tokens));
    }

    /// Ask the summarization port for a short summary of `messages`.
    ///
    /// Every message is rendered as `Role: content` on its own line and the
    /// resulting transcript is embedded in a fixed summarization prompt.
    ///
    /// # Errors
    ///
    /// Returns whatever error the port's `summarize` call produces.
    async fn llm_summarize(
        &self,
        inference: &dyn SummarizationPort,
        messages: &[ChatMessage],
    ) -> Result<String, TokenOptError> {
        const SYSTEM_PROMPT: &str =
            "You are a precise summarizer. Output only the summary, nothing else.";

        // Build the newline-separated transcript in a single buffer.
        let mut content = String::new();
        for (position, message) in messages.iter().enumerate() {
            if position > 0 {
                content.push('\n');
            }
            content.push_str(role_label(message.role));
            content.push_str(": ");
            content.push_str(&message.content);
        }

        let prompt = format!(
            "Summarize this conversation excerpt in 2-3 sentences. \
             Preserve key facts, decisions, and context. Be concise.\n\n{content}"
        );

        inference.summarize(SYSTEM_PROMPT, &prompt).await
    }
}

/// Normalize whitespace: every run of Unicode whitespace becomes a single
/// ASCII space, and leading/trailing whitespace is dropped.
fn collapse_whitespace(text: &str) -> String {
    // `split_whitespace` yields the non-whitespace runs only, so joining them
    // with one space collapses inner runs and trims both ends in one pass.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Remove the least-relevant non-system messages from a conversation.
///
/// Messages are scored with `crate::history::relevance::score_messages`
/// against the current `query` to decide which ones to keep. System messages
/// are always preserved. The last `PRESERVE_RECENT_TURNS` user+assistant turn
/// pairs are exempt from relevance pruning (conversational coherence).
///
/// Among older messages, candidates are considered in the order the scorer
/// returns them and each one that still fits the remaining budget is kept,
/// making the pruning relevance-aware rather than pure FIFO. Kept messages
/// stay in their original conversation order.
///
/// Aggressive fallback: if the rebuilt history is still over
/// `history_budget`, everything except the system messages and the last two
/// non-system messages is removed — including otherwise protected recent
/// turns.
///
/// Returns the removed messages and the count of messages removed.
fn prune_by_relevance(
    conversation: &mut Conversation,
    history_budget: u32,
    query: &str,
) -> (Vec<ChatMessage>, usize) {
    const PRESERVE_RECENT_TURNS: usize = 5;
    /// Non-system messages that survive the aggressive fallback.
    const AGGRESSIVE_KEEP: usize = 2;
    let preserve_messages = PRESERVE_RECENT_TURNS * 2;

    let messages = std::mem::take(&mut conversation.messages);

    // Separate system messages from others
    let (system_msgs, mut older): (Vec<_>, Vec<_>) = messages
        .into_iter()
        .partition(|m| m.role == MessageRole::System);

    if older.len() <= preserve_messages {
        // Nothing to prune — everything is recent
        conversation.messages = system_msgs;
        conversation.messages.extend(older);
        return (Vec::new(), 0);
    }

    // Split into older (candidates for pruning) and recent (always kept).
    // `split_off` moves the tail out, so no message has to be cloned.
    let recent = older.split_off(older.len() - preserve_messages);

    // Score the older messages by relevance to the current query
    let scores = crate::history::relevance::score_messages(query, older.as_slice());

    // Recent and system messages are kept unconditionally; whatever budget
    // is left after them is available to the older messages.
    let reserved_tokens = TokenEstimator::estimate_messages(recent.as_slice())
        .saturating_add(TokenEstimator::estimate_messages(system_msgs.as_slice()));
    let remaining_budget = history_budget.saturating_sub(reserved_tokens);

    // Greedily keep the highest-scoring older messages that fit. Each
    // candidate is costed with `estimate_messages`, the same estimator the
    // final budget check below uses, so the greedy fill and that check agree.
    let mut keep_mask = vec![false; older.len()];
    let mut used_tokens = 0u32;
    for &(idx, _) in &scores {
        if let Some(candidate) = older.get(idx) {
            let msg_tokens = TokenEstimator::estimate_messages(std::slice::from_ref(candidate));
            let projected = used_tokens.saturating_add(msg_tokens);
            if projected <= remaining_budget {
                keep_mask[idx] = true;
                used_tokens = projected;
            }
        }
    }

    // Walk the older messages in their original order, moving each one into
    // either the kept list or the pruned list.
    let mut kept_older: Vec<ChatMessage> = Vec::new();
    let mut pruned: Vec<ChatMessage> = Vec::new();
    for (msg, keep) in older.into_iter().zip(keep_mask) {
        if keep {
            kept_older.push(msg);
        } else {
            pruned.push(msg);
        }
    }

    // Reconstruct: system + kept older (in order) + recent
    conversation.messages = system_msgs;
    conversation.messages.extend(kept_older);
    conversation.messages.extend(recent);

    // If still over budget, aggressive fallback: keep only the system
    // messages and the last `AGGRESSIVE_KEEP` non-system messages.
    let kept_tokens = TokenEstimator::estimate_messages(&conversation.messages);
    if kept_tokens > history_budget && conversation.messages.len() > 2 {
        let all_msgs = std::mem::take(&mut conversation.messages);
        let (sys, mut dropped): (Vec<_>, Vec<_>) = all_msgs
            .into_iter()
            .partition(|m| m.role == MessageRole::System);
        let extra_pruned = dropped.len().saturating_sub(AGGRESSIVE_KEEP);
        // After the split `dropped` holds the messages to remove and
        // `survivors` the tail that stays in the conversation.
        let survivors = dropped.split_off(extra_pruned);
        pruned.extend(dropped);
        conversation.messages = sys;
        conversation.messages.extend(survivors);
    }

    let messages_removed = pruned.len();
    (pruned, messages_removed)
}

/// Truncate a summary string to fit within a token budget.
fn truncate_summary(summary: &str, max_tokens: u32) -> String {
    let current_tokens = TokenEstimator::estimate_tokens(summary);
    if current_tokens <= max_tokens {
        return summary.to_string();
    }

    // Approximate character limit
    let max_chars = (max_tokens as usize) * 4;
    let truncated: String = summary.chars().take(max_chars.saturating_sub(3)).collect();
    format!("{truncated}...")
}

/// Speaker prefix for a message role, used when rendering messages as
/// `Role: content` lines for summarization.
const fn role_label(role: MessageRole) -> &'static str {
    match role {
        MessageRole::System => "System",
        MessageRole::User => "User",
        MessageRole::Assistant => "Assistant",
        MessageRole::Tool => "Tool",
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Append 20 numbered user/assistant exchanges, each message ending in
    /// the given suffix.
    fn add_twenty_turns(conv: &mut Conversation, user_suffix: &str, assistant_suffix: &str) {
        for i in 0..20 {
            conv.add_user_message(format!("Question {i}{user_suffix}"));
            conv.add_assistant_message(format!("Answer {i}{assistant_suffix}"));
        }
    }

    /// Count the messages carrying the system role.
    fn system_message_count(conv: &Conversation) -> usize {
        conv.messages
            .iter()
            .filter(|m| m.role == MessageRole::System)
            .count()
    }

    #[test]
    fn collapse_whitespace_removes_extra_spaces() {
        let collapsed = collapse_whitespace("  hello   world  ");
        assert_eq!(collapsed, "hello world");
    }

    #[test]
    fn collapse_whitespace_handles_newlines() {
        let collapsed = collapse_whitespace("hello\n\n  world");
        assert_eq!(collapsed, "hello world");
    }

    #[test]
    fn truncate_summary_within_budget_unchanged() {
        let short = "Short summary.";
        assert_eq!(truncate_summary(short, 100), short);
    }

    #[test]
    fn truncate_summary_over_budget() {
        let long_text = "A".repeat(2000);
        let shortened = truncate_summary(&long_text, 10);
        // 10 tokens * 4 chars gives a budget of about 40 chars, ellipsis included
        assert!(shortened.len() < 50);
        assert!(shortened.ends_with("..."));
    }

    #[test]
    fn prune_preserves_system_messages() {
        let mut conv = Conversation::with_system_prompt("System prompt");
        add_twenty_turns(&mut conv, "", "");

        let before = system_message_count(&conv);
        let _ = prune_by_relevance(&mut conv, 500, "Question");
        let after = system_message_count(&conv);

        // System messages should be preserved (there are 0 because system_prompt
        // is stored separately, not as a message)
        assert_eq!(before, after);
    }

    #[test]
    fn prune_keeps_recent_messages() {
        let mut conv = Conversation::new();
        add_twenty_turns(
            &mut conv,
            " with some extra padding text",
            " with a longer response body here",
        );

        // Budget low enough to force pruning of older messages
        let (pruned, removed) = prune_by_relevance(&mut conv, 100, "Question 19");
        // Some older messages must have been pruned, and the count must match
        assert!(removed > 0);
        assert_eq!(pruned.len(), removed);

        // The newest message, "Answer 19...", must still be last
        let newest = conv
            .messages
            .last()
            .expect("conversation should still contain messages");
        assert!(newest.content.starts_with("Answer 19"));
    }
}