// Source: collet 0.1.1 (docs.rs)

mod compaction;
#[cfg(test)]
mod tests;

use crate::api::content::Content;
use crate::api::models::Message;

pub(super) const MAX_COMPACTION_LOG: usize = 20;

/// Manages conversation history with token-aware context window.
#[derive(Clone)]
pub struct ConversationContext {
    messages: Vec<Message>,
    system_prompt: String,
    /// Approximate max tokens for the entire context.
    max_context_tokens: usize,
    /// Trigger compaction at this fraction of budget.
    compaction_threshold: f32,
    /// When true, adaptively adjusts compaction_threshold based on conversation characteristics.
    adaptive_compaction: bool,
    /// Last reasoning_content from LLM (preserved thinking).
    last_reasoning: Option<String>,
    /// Compaction history (for status display).
    compaction_log: Vec<CompactionEvent>,
    /// Actual prompt token count from the last API response (None until first call).
    last_actual_prompt_tokens: Option<usize>,
    /// Number of messages present when last_actual_prompt_tokens was recorded.
    /// Used to estimate only the delta for messages added after the last API call.
    messages_at_last_api_call: usize,
    /// Running sum of estimated tokens for all messages (incremental, O(1) per push).
    /// Recomputed from scratch after compaction.
    cached_msg_tokens: usize,
    /// Count of tool/tool_call messages — enables O(1) tool density in adaptive_threshold().
    cached_tool_count: usize,
    /// Pre-built system message content (system_prompt + optional reasoning).
    /// Recomputed whenever system_prompt or last_reasoning changes to avoid
    /// repeated `format!()` calls in the hot path of build_messages().
    cached_system_content: String,
    /// Preserved implementation decisions extracted before compaction.
    /// Never compacted away — injected into every system message.
    decision_log: Vec<String>,
}

/// Record of a compaction event.
#[derive(Debug, Clone)]
pub struct CompactionEvent {
    /// Number of messages before compaction.
    pub before_messages: usize,
    /// Number of messages after compaction.
    pub after_messages: usize,
    /// Estimated tokens before.
    pub before_tokens: usize,
    /// Estimated tokens after.
    pub after_tokens: usize,
    /// Summary generated.
    pub summary_preview: String,
}

impl ConversationContext {
    pub fn new(system_prompt: String) -> Self {
        Self::with_budget(system_prompt, 200_000, 0.8)
    }

    /// Create with custom token budget and compaction threshold.
    pub fn with_budget(
        system_prompt: String,
        max_context_tokens: usize,
        compaction_threshold: f32,
    ) -> Self {
        let cached_system_content = system_prompt.clone();
        Self {
            messages: Vec::new(),
            system_prompt,
            max_context_tokens,
            compaction_threshold,
            adaptive_compaction: true,
            last_reasoning: None,
            compaction_log: Vec::new(),
            last_actual_prompt_tokens: None,
            messages_at_last_api_call: 0,
            cached_msg_tokens: 0,
            cached_tool_count: 0,
            cached_system_content,
            decision_log: Vec::new(),
        }
    }

    /// Build the system content string from prompt + optional reasoning (shared helper).
    fn make_system_content(system_prompt: &str, reasoning: Option<&str>) -> String {
        match reasoning {
            Some(r) => format!(
                "{}\n\n## Previous Reasoning (preserved)\n\n<reasoning>\n{}\n</reasoning>",
                system_prompt, r
            ),
            None => system_prompt.to_string(),
        }
    }

    /// Restore a context from a saved session snapshot using the default budget.
    ///
    /// Used by headless/ACP paths. For TUI sessions use `restore_with_budget`.
    pub fn restore(
        system_prompt: String,
        messages: Vec<Message>,
        last_reasoning: Option<String>,
    ) -> Self {
        let cached_msg_tokens = messages.iter().map(Self::estimate_message_tokens).sum();
        let cached_tool_count = messages
            .iter()
            .filter(|m| m.role == "tool" || m.tool_calls.is_some())
            .count();
        let cached_system_content =
            Self::make_system_content(&system_prompt, last_reasoning.as_deref());
        Self {
            messages,
            system_prompt,
            max_context_tokens: 200_000,
            compaction_threshold: 0.8,
            adaptive_compaction: true,
            last_reasoning,
            compaction_log: Vec::new(),
            last_actual_prompt_tokens: None,
            messages_at_last_api_call: 0,
            cached_msg_tokens,
            cached_tool_count,
            cached_system_content,
            decision_log: Vec::new(),
        }
    }

    /// Restore with custom budget.
    pub fn restore_with_budget(
        system_prompt: String,
        messages: Vec<Message>,
        last_reasoning: Option<String>,
        max_context_tokens: usize,
        compaction_threshold: f32,
    ) -> Self {
        let cached_msg_tokens = messages.iter().map(Self::estimate_message_tokens).sum();
        let cached_tool_count = messages
            .iter()
            .filter(|m| m.role == "tool" || m.tool_calls.is_some())
            .count();
        let cached_system_content =
            Self::make_system_content(&system_prompt, last_reasoning.as_deref());
        Self {
            messages,
            system_prompt,
            max_context_tokens,
            compaction_threshold,
            adaptive_compaction: true,
            last_reasoning,
            compaction_log: Vec::new(),
            last_actual_prompt_tokens: None,
            messages_at_last_api_call: 0,
            cached_msg_tokens,
            cached_tool_count,
            cached_system_content,
            decision_log: Vec::new(),
        }
    }

    /// Get the max context tokens setting.
    pub fn max_context_tokens(&self) -> usize {
        self.max_context_tokens
    }

    /// Get token count currently in context.
    /// Uses actual API-reported count when available, falls back to heuristic estimate.
    pub fn used_tokens(&self) -> usize {
        self.actual_used_tokens()
    }

    /// Update the context with the actual prompt token count returned by the API.
    /// Call immediately after receiving `StreamEvent::Done { prompt_tokens, .. }`.
    pub fn update_actual_tokens(&mut self, prompt_tokens: u32) {
        self.last_actual_prompt_tokens = Some(prompt_tokens as usize);
        self.messages_at_last_api_call = self.messages.len();
    }

    /// Returns the actual prompt token count if available, otherwise falls back to
    /// the heuristic estimate. When actual data is available, only the delta for
    /// messages added since the last API call is estimated and added to the baseline.
    ///
    /// If the message set shrank since the last API call (e.g., compaction ran
    /// without resetting the baseline), the stale baseline is discarded and we
    /// fall back to a fresh estimate. The final value is also lower-bounded by
    /// the current heuristic estimate so large in-flight additions are reflected
    /// in the UI even before the next `Done` event refreshes the baseline.
    fn actual_used_tokens(&self) -> usize {
        let estimate = self.estimate_total_tokens();
        match self.last_actual_prompt_tokens {
            None => estimate,
            Some(base) => {
                if self.messages.len() < self.messages_at_last_api_call {
                    return estimate;
                }
                let new_count = self.messages.len() - self.messages_at_last_api_call;
                let new_tokens: usize = self
                    .messages
                    .iter()
                    .rev()
                    .take(new_count)
                    .map(Self::estimate_message_tokens)
                    .sum();
                (base + new_tokens).max(estimate)
            }
        }
    }

    pub fn system_prompt(&self) -> &str {
        &self.system_prompt
    }

    pub fn push(&mut self, message: Message) {
        if message.role == "tool" || message.tool_calls.is_some() {
            self.cached_tool_count += 1;
        }
        self.cached_msg_tokens += Self::estimate_message_tokens(&message);
        self.messages.push(message);
        self.compact_if_needed();
    }

    /// Push a message and return whether compaction was triggered.
    pub fn push_and_report(&mut self, message: Message) -> bool {
        let before = self.compaction_log.len();
        if message.role == "tool" || message.tool_calls.is_some() {
            self.cached_tool_count += 1;
        }
        self.cached_msg_tokens += Self::estimate_message_tokens(&message);
        self.messages.push(message);
        self.compact_if_needed();
        self.compaction_log.len() > before
    }

    pub fn messages(&self) -> &[Message] {
        &self.messages
    }

    /// Store reasoning content from the last LLM response.
    pub fn set_last_reasoning(&mut self, reasoning: String) {
        self.cached_system_content =
            Self::make_system_content(&self.system_prompt, Some(&reasoning));
        self.last_reasoning = Some(reasoning);
    }

    /// How many compactions have occurred.
    pub fn compaction_count(&self) -> usize {
        self.compaction_log.len()
    }

    /// Enable or disable adaptive compaction threshold adjustment.
    pub fn set_adaptive_compaction(&mut self, enabled: bool) {
        self.adaptive_compaction = enabled;
    }

    /// Get compaction history.
    pub fn compaction_log(&self) -> &[CompactionEvent] {
        &self.compaction_log
    }

    /// Build the full message list including system prompt and preserved reasoning.
    pub fn build_messages(&self) -> Vec<Message> {
        // Inject preserved decisions into the system message so they survive compaction.
        let system_content = if self.decision_log.is_empty() {
            self.cached_system_content.clone()
        } else {
            let entries = self
                .decision_log
                .iter()
                .map(|d| format!("- {d}"))
                .collect::<Vec<_>>()
                .join("\n");
            format!(
                "{}\n\n## Implementation Decisions (preserved — do not re-decide)\n\n{}",
                self.cached_system_content, entries
            )
        };
        let mut msgs = Vec::with_capacity(self.messages.len() + 1);
        msgs.push(Message {
            role: "system".to_string(),
            content: Some(Content::text(&system_content)),
            reasoning_content: None,
            tool_calls: None,
            tool_call_id: None,
        });
        msgs.extend(self.messages.iter().cloned());
        msgs
    }

    /// Record an implementation decision to the persistent log.
    ///
    /// Decisions are injected into the system prompt and never compacted away,
    /// preserving implementation continuity across long tasks.
    pub fn record_decision(&mut self, decision: String) {
        if !self.decision_log.iter().any(|d| d == &decision) {
            self.decision_log.push(decision);
        }
        if self.decision_log.len() > 20 {
            let drain_to = self.decision_log.len() - 20;
            self.decision_log.drain(..drain_to);
        }
    }

    /// Returns the implementation decision log (injected into system prompt after compaction).
    pub fn decision_log(&self) -> &[String] {
        &self.decision_log
    }

    /// Estimate token count for a message (~4 chars per token for code).
    fn estimate_message_tokens(msg: &Message) -> usize {
        let base = match &msg.content {
            Some(Content::Text(s)) => s.len() / 4 + 10,
            Some(Content::Parts(parts)) => {
                let mut total = 0;
                for part in parts {
                    match part {
                        crate::api::ContentPart::Text { text } => {
                            total += text.len() / 4 + 10;
                        }
                        crate::api::ContentPart::ImageUrl { image_url } => {
                            // Vision API 토큰 추정 (OpenAI 기준)
                            // low detail: 85 tokens
                            // high detail: 최대 1105 tokens (2048x2048 기준)
                            total += match image_url.detail.as_deref() {
                                Some("low") => 85,
                                Some("high") => 1105,
                                _ => {
                                    // base64 URL 길이로 이미지 크기 추정
                                    // ~150KB base64 ≈ ~110KB raw → low detail 영역
                                    if image_url.url.len() < 150_000 {
                                        85
                                    } else {
                                        512
                                    }
                                }
                            };
                        }
                    }
                }
                total
            }
            None => 0,
        };

        // tool_calls 등 추가 고려
        base + msg.tool_calls.as_ref().map(|t| t.len() * 20).unwrap_or(0)
    }

    /// Estimate total tokens in the conversation.
    ///
    /// Uses the incrementally-maintained `cached_msg_tokens` (updated on every
    /// push) instead of re-summing the full message list each call.
    pub fn estimate_total_tokens(&self) -> usize {
        let system_tokens = self.system_prompt.len() / 4 + 10;
        system_tokens + self.cached_msg_tokens
    }

    /// Compute adaptive compaction threshold based on conversation characteristics.
    ///
    /// Tool-heavy conversations compact earlier (more tokens per turn),
    /// while discussion-heavy conversations can use more of the window.
    ///
    /// Produces thresholds in the range 0.60–0.85:
    /// - High tool density (>60%): 0.60–0.65 (compact aggressively)
    /// - Moderate tool density (>35%): 0.65–0.70
    /// - Conversational (<15% tools): 0.80–0.85 (keep more context)
    fn adaptive_threshold(&self) -> f32 {
        // If adaptive is disabled, use the configured threshold strictly.
        if !self.adaptive_compaction || self.messages.len() < 6 {
            return self.compaction_threshold;
        }

        // Measure tool density: fraction of messages that are tool calls/results
        let tool_density = self.cached_tool_count as f32 / self.messages.len() as f32;

        // Measure avg turn size in tokens (larger turns = need earlier compaction)
        let avg_msg_tokens = self.cached_msg_tokens as f32 / self.messages.len().max(1) as f32;

        let base = self.compaction_threshold;

        // Adjustment from tool density (primary factor)
        let density_adj: f32 = if tool_density > 0.6 {
            -0.13 // Heavy tool use: compact significantly earlier
        } else if tool_density > 0.35 {
            -0.07 // Moderate tool use
        } else if tool_density > 0.15 {
            0.0 // Mixed: use base as-is
        } else {
            0.05 // Mostly conversational: allow more context
        };

        // Size adjustment: large average message size pushes threshold down
        let size_adj: f32 = if avg_msg_tokens > 500.0 {
            -0.05 // Very large turns (file reads, long tool output)
        } else if avg_msg_tokens > 200.0 {
            -0.02
        } else {
            0.0
        };

        (base + density_adj + size_adj).clamp(0.40, 0.90)
    }

    /// Adaptive compaction target: how low to compress based on conversation type.
    ///
    /// Tool-heavy conversations need more aggressive compression because tool
    /// outputs dominate token usage. Conversational contexts can be lighter.
    fn adaptive_target_ratio(&self) -> f64 {
        let tool_density = if self.messages.is_empty() {
            0.0
        } else {
            self.cached_tool_count as f64 / self.messages.len() as f64
        };

        if tool_density > 0.6 {
            0.20 // Heavy tool use → compress to 20%
        } else if tool_density > 0.35 {
            0.25 // Moderate → 25%
        } else {
            0.30 // Conversational → 30%
        }
    }

    // =====================================================================
    // Multi-Pass Compaction Pipeline
    //
    // Based on research from Microsoft Agent Framework, JetBrains, OpenCode:
    //   Pass 0: Observation masking (replace old tool outputs with placeholders)
    //   Pass 1: SimHash dedup (collapse near-identical tool results)
    //   Pass 2: Smart relevance-aware compaction (IDF scoring, structured summary)
    //   Pass 3: Fallback fixed-window truncation
    //
    // Each pass is progressively more aggressive. Pipeline stops early when
    // the target token budget is reached.
    // =====================================================================

    /// Extract implementation decisions from a message range before they are compacted away.
    ///
    /// Scans assistant messages for decision-patterned sentences and saves them
    /// to `decision_log` so they persist in the system prompt after compaction.
    fn preserve_decisions_from_range(&mut self, start: usize, end: usize) {
        let end = end.min(self.messages.len());
        if start >= end {
            return;
        }
        let mut extracted: Vec<String> = Vec::new();
        for msg in &self.messages[start..end] {
            if msg.role != "assistant" {
                continue;
            }
            let text = match &msg.content {
                Some(c) => c.text_content(),
                None => continue,
            };
            for line in text.lines().take(40) {
                let t = line.trim();
                if t.len() < 25 || t.len() > 400 {
                    continue;
                }
                let is_decision = t.starts_with("I'll ")
                    || t.starts_with("I will ")
                    || t.starts_with("The fix ")
                    || t.starts_with("The approach ")
                    || t.starts_with("Decided to ")
                    || t.starts_with("The solution ")
                    || (t.contains(" because ") && t.len() > 50);
                if is_decision {
                    let entry = crate::util::truncate_bytes(t, 300).to_string();
                    if !self.decision_log.iter().any(|d| d == &entry)
                        && !extracted.iter().any(|d| d == &entry)
                    {
                        extracted.push(entry);
                    }
                }
            }
        }
        self.decision_log.extend(extracted);
        if self.decision_log.len() > 30 {
            let drain_to = self.decision_log.len() - 30;
            self.decision_log.drain(..drain_to);
        }
    }

    pub fn compaction_quality(&self) -> Option<crate::bench::CompactionQuality> {
        let last = self.compaction_log.last()?;

        let token_reduction = if last.before_tokens > 0 {
            last.after_tokens as f32 / last.before_tokens as f32
        } else {
            1.0
        };

        // Count file paths, decisions, errors in current messages
        let mut files_in_context = 0u32;
        let mut decisions_in_context = 0u32;
        let mut errors_in_context = 0u32;

        for msg in &self.messages {
            if let Some(ref content) = msg.content {
                let text = content.text_content();
                files_in_context += text
                    .lines()
                    .filter(|l| l.contains('/') && l.contains('.') && !l.starts_with("http"))
                    .count() as u32;
                decisions_in_context += text
                    .lines()
                    .filter(|l| {
                        let t = l.trim();
                        t.starts_with("I'll ")
                            || t.starts_with("Let me ")
                            || t.starts_with("I need to ")
                            || t.starts_with("Files Modified")
                            || t.starts_with("Key Decisions")
                    })
                    .count() as u32;
                if text.starts_with("Error:") || text.starts_with("error:") {
                    errors_in_context += 1;
                }
                if text.contains("Errors Encountered") {
                    errors_in_context += 1;
                }
            }
        }

        let msg_ratio = if last.before_messages > 0 {
            last.before_messages as f32
        } else {
            1.0
        };

        Some(crate::bench::CompactionQuality {
            preserved_files: (files_in_context as f32 / msg_ratio).min(1.0),
            preserved_decisions: (decisions_in_context as f32 / (msg_ratio * 0.3)).min(1.0),
            preserved_errors: (errors_in_context as f32 / (msg_ratio * 0.1).max(1.0)).min(1.0),
            token_reduction,
        })
    }

    /// Remaining token budget estimate (logged once per agent loop iteration).
    pub fn budget_remaining(&self) -> usize {
        self.max_context_tokens
            .saturating_sub(self.estimate_total_tokens())
    }

    /// Preemptively run threshold-based compaction without pushing a new message.
    /// Returns `true` if compaction actually ran (i.e., a new entry was appended
    /// to the compaction log). Safe to call before building an API request —
    /// it only acts when the configured threshold is crossed.
    pub fn maybe_compact(&mut self) -> bool {
        let before = self.compaction_log.len();
        self.compact_if_needed();
        self.compaction_log.len() > before
    }

    /// Update the system prompt (e.g., after repo map rebuild).
    pub fn update_system_prompt(&mut self, new_prompt: String) {
        self.system_prompt = new_prompt;
        self.cached_system_content =
            Self::make_system_content(&self.system_prompt, self.last_reasoning.as_deref());
    }

    /// Append content to the system prompt (e.g., skill metadata).
    pub fn append_system_prompt(&mut self, content: &str) {
        self.system_prompt.push_str("\n\n");
        self.system_prompt.push_str(content);
        self.cached_system_content =
            Self::make_system_content(&self.system_prompt, self.last_reasoning.as_deref());
    }

    /// Clear all messages and reset token tracking (keep system prompt).
    pub fn clear(&mut self) {
        self.messages.clear();
        self.cached_msg_tokens = 0;
        self.cached_tool_count = 0;
        self.last_actual_prompt_tokens = None;
        self.messages_at_last_api_call = 0;
        self.last_reasoning = None;
        self.cached_system_content = self.system_prompt.clone();
    }

    /// Force compaction regardless of threshold.
    ///
    /// Uses the same multi-pass pipeline as auto-compaction but bypasses
    /// the threshold check. Targets 25% of max context.
    pub fn force_compact(&mut self) {
        // Need at least system prompt + 1 user/assistant pair to compact
        if self.messages.len() <= 2 {
            return;
        }

        tracing::info!(
            "Force compaction requested: ~{} messages",
            self.messages.len()
        );

        let target_ratio = self.adaptive_target_ratio();
        let target_tokens = (self.max_context_tokens as f64 * target_ratio) as usize;

        self.run_compaction_pipeline(target_tokens);
    }
}