// ai_tokenopt 0.5.5

//! Token budget allocation engine
//!
//! Dynamically distributes the available context window across competing
//! concerns: system prompt, RAG context, tool definitions, conversation
//! history, and response headroom. Flags when compaction is needed.

use crate::config::TokenOptimizationConfig;
use crate::estimator::ConversationTokenEstimate;

/// Dynamic token budget allocation across context window sections.
///
/// Produced by the `allocate*` methods on [`TokenBudget`]. All token fields
/// are budgets (upper bounds) expressed in tokens, not measurements of the
/// content actually present.
#[derive(Debug, Clone, PartialEq)]
pub struct BudgetAllocation {
    /// Maximum tokens for the system prompt: the smaller of the estimated
    /// prompt size and the configured system-prompt cap.
    pub system_prompt: u32,
    /// Maximum tokens for RAG-injected context. This is a cap only; it is
    /// not deducted when the history budget is computed.
    pub rag_context: u32,
    /// Maximum tokens for tool definitions (0 when no tools are present)
    pub tool_definitions: u32,
    /// Maximum tokens for conversation history (messages + summary)
    pub history: u32,
    /// Tokens reserved for the LLM response
    pub response_headroom: u32,
    /// Whether history compaction is required to fit within budget
    pub requires_compaction: bool,
    /// Current pressure level (0.0 = empty, 1.0 = at capacity).
    ///
    /// Computed as estimated usage (including tool definitions) divided by
    /// the budget left after response headroom, and clamped to at most 1.0.
    pub pressure: f32,
}

/// Estimated per-tool token overhead for definition schema.
///
/// A flat figure multiplied by the number of tools; the actual tool schemas
/// are not measured by this module.
const TOKENS_PER_TOOL_ESTIMATE: u32 = 150;

/// Token budget calculator.
///
/// Holds the context window size and the ratio settings copied from
/// `TokenOptimizationConfig`, and turns token estimates into a
/// [`BudgetAllocation`].
#[derive(Debug)]
pub struct TokenBudget {
    /// Total context window size, in tokens.
    context_window: u32,
    /// Fraction of the context window reserved for the model's response.
    response_headroom_ratio: f32,
    /// Fraction of the history budget above which compaction is flagged.
    compaction_trigger_ratio: f32,
    /// Fraction of the post-headroom budget that caps the system prompt.
    system_prompt_budget_ratio: f32,
    /// Fraction of the post-headroom budget reported as the RAG context cap.
    rag_budget_ratio: f32,
}

impl TokenBudget {
    /// Pressure at or above which
    /// [`allocate_with_pressure_priority`](Self::allocate_with_pressure_priority)
    /// rebalances the budget.
    const HIGH_PRESSURE_THRESHOLD: f32 = 0.9;

    /// Create a new budget calculator from configuration.
    ///
    /// Copies the context window size and the ratio settings out of
    /// `config`; the configuration itself is not retained.
    #[must_use]
    pub fn new(config: &TokenOptimizationConfig) -> Self {
        Self {
            context_window: config.context_window_tokens,
            response_headroom_ratio: config.response_headroom_ratio,
            compaction_trigger_ratio: config.compaction_trigger_ratio,
            system_prompt_budget_ratio: config.system_prompt_budget_ratio,
            rag_budget_ratio: config.rag_budget_ratio,
        }
    }

    /// Compute a dynamic budget allocation based on actual content sizes.
    ///
    /// The algorithm:
    /// 1. Reserve response headroom (fixed fraction of context window)
    /// 2. Allocate system prompt — min(actual, budget cap)
    /// 3. Report the RAG context cap (fraction of the post-headroom budget)
    /// 4. Allocate tool definitions if tools are present
    /// 5. Remaining budget goes to conversation history
    /// 6. If history exceeds the compaction trigger fraction of that
    ///    remaining budget → flag compaction required
    ///
    /// `tool_count` is ignored when `has_tools` is `false`. All token
    /// arithmetic saturates, so extreme inputs cannot overflow.
    #[must_use]
    pub fn allocate(
        &self,
        estimate: &ConversationTokenEstimate,
        has_tools: bool,
        tool_count: usize,
    ) -> BudgetAllocation {
        self.allocate_with_ratios(
            estimate,
            self.system_prompt_budget_ratio,
            self.rag_budget_ratio,
            Self::tool_tokens(has_tools, tool_count),
        )
    }

    /// Allocate budget with pressure-aware priority ordering.
    ///
    /// When the context window is under pressure (>= 90% full), this method
    /// rebalances budgets using a fixed priority order:
    ///
    /// 1. **System prompt** is protected: it keeps min(actual, cap).
    /// 2. **Tool definitions** are capped at half of the space left after
    ///    the system prompt, so history always keeps some room.
    /// 3. **History** receives whatever remains.
    ///
    /// Compaction is flagged with the same trigger ratio as
    /// [`allocate`](Self::allocate), applied to the rebalanced history
    /// budget, so the high-pressure path is never less eager to compact
    /// than the normal path.
    ///
    /// Below 90% pressure, this behaves identically to [`allocate`](Self::allocate).
    #[must_use]
    pub fn allocate_with_pressure_priority(
        &self,
        estimate: &ConversationTokenEstimate,
        has_tools: bool,
        tool_count: usize,
    ) -> BudgetAllocation {
        let base = self.allocate(estimate, has_tools, tool_count);
        if base.pressure < Self::HIGH_PRESSURE_THRESHOLD {
            return base;
        }

        // Headroom, system prompt, RAG cap and pressure do not depend on the
        // rebalancing, so they are taken from the base allocation as-is.
        let available = self.context_window.saturating_sub(base.response_headroom);
        let after_system = available.saturating_sub(base.system_prompt);

        // Tools: capped at half the remaining space (leave room for history).
        let tool_definitions = base.tool_definitions.min(after_system / 2);

        // History: whatever is left once the system prompt and tools are served.
        let history = after_system.saturating_sub(tool_definitions);
        let requires_compaction = self.needs_compaction(estimate, history);

        BudgetAllocation {
            tool_definitions,
            history,
            requires_compaction,
            ..base
        }
    }

    /// Allocate budget with presence-aware redistribution.
    ///
    /// Unlike [`allocate`](Self::allocate), this method detects which
    /// components are absent and redistributes their budget:
    ///
    /// - **No RAG**: RAG budget ratio is added to the system prompt cap,
    ///   giving long system prompts more room before trimming, and the
    ///   reported RAG cap is 0.
    /// - **No tools**: tool budget is 0 and goes to history.
    /// - **Small system prompt**: unused cap space goes to history.
    ///
    /// History receives whatever the system prompt and tool definitions
    /// leave unused. With `has_rag == true` the result is identical to
    /// [`allocate`](Self::allocate).
    #[must_use]
    pub fn allocate_adaptive(
        &self,
        estimate: &ConversationTokenEstimate,
        has_rag: bool,
        has_tools: bool,
        tool_count: usize,
    ) -> BudgetAllocation {
        let (system_ratio, rag_ratio) = if has_rag {
            (self.system_prompt_budget_ratio, self.rag_budget_ratio)
        } else {
            // No RAG: give its budget to the system prompt cap.
            (
                self.system_prompt_budget_ratio + self.rag_budget_ratio,
                0.0_f32,
            )
        };

        self.allocate_with_ratios(
            estimate,
            system_ratio,
            rag_ratio,
            Self::tool_tokens(has_tools, tool_count),
        )
    }

    /// Shared allocation core used by the public `allocate*` entry points.
    ///
    /// `system_ratio` and `rag_ratio` are the effective cap fractions of the
    /// post-headroom budget; `tool_definitions` is the already-estimated
    /// token cost of the tool schemas.
    fn allocate_with_ratios(
        &self,
        estimate: &ConversationTokenEstimate,
        system_ratio: f32,
        rag_ratio: f32,
        tool_definitions: u32,
    ) -> BudgetAllocation {
        // 1. Reserve response headroom.
        let response_headroom = Self::share_of(self.context_window, self.response_headroom_ratio);
        let available = self.context_window.saturating_sub(response_headroom);

        // 2. System prompt: the smaller of its actual size and its cap.
        let system_prompt = estimate
            .system_prompt
            .min(Self::share_of(available, system_ratio));

        // 3. RAG tokens are part of the system_prompt estimate in the current
        //    pipeline, so this cap is for the optimizer to enforce during RAG
        //    dedup and is not deducted from the history budget.
        let rag_context = Self::share_of(available, rag_ratio);

        // 4./5. History gets everything not consumed by system prompt and tools.
        let used_by_fixed = system_prompt.saturating_add(tool_definitions);
        let history = available.saturating_sub(used_by_fixed);

        // 6. Check if compaction is needed.
        let requires_compaction = self.needs_compaction(estimate, history);

        // Overall pressure: how full the post-headroom budget is. With no
        // budget at all the window is reported as full.
        let total_used = estimate.total.saturating_add(tool_definitions);
        #[allow(clippy::cast_possible_truncation)] // f64 -> f32 precision loss is acceptable
        let pressure = if available > 0 {
            (f64::from(total_used) / f64::from(available)) as f32
        } else {
            1.0
        };

        BudgetAllocation {
            system_prompt,
            rag_context,
            tool_definitions,
            history,
            response_headroom,
            requires_compaction,
            pressure: pressure.min(1.0),
        }
    }

    /// Whether history plus summary exceeds the compaction trigger fraction
    /// of `history_budget`.
    ///
    /// Empty history never requires compaction: the threshold is unsigned,
    /// so a strict `>` comparison cannot hold for zero tokens.
    fn needs_compaction(&self, estimate: &ConversationTokenEstimate, history_budget: u32) -> bool {
        let history_actual = estimate.history.saturating_add(estimate.summary);
        history_actual > Self::share_of(history_budget, self.compaction_trigger_ratio)
    }

    /// Estimated token cost of `tool_count` tool definitions, or 0 when
    /// `has_tools` is `false`. Saturates at `u32::MAX` instead of overflowing.
    fn tool_tokens(has_tools: bool, tool_count: usize) -> u32 {
        if !has_tools {
            return 0;
        }
        // Clamp before narrowing so the cast below is lossless.
        #[allow(clippy::cast_possible_truncation)]
        let count = tool_count.min(u32::MAX as usize) as u32;
        count.saturating_mul(TOKENS_PER_TOOL_ESTIMATE)
    }

    /// `ratio` of `total`, truncated towards zero.
    ///
    /// The float-to-integer cast saturates (negative or NaN ratios yield 0),
    /// and the result is clamped to `total` so a ratio above 1.0 can never
    /// hand out more than the whole.
    fn share_of(total: u32, ratio: f32) -> u32 {
        #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
        let share = (f64::from(total) * f64::from(ratio)) as u32;
        share.min(total)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::TokenOptimizationConfig;

    /// Budget calculator built from the default configuration.
    fn default_budget() -> TokenBudget {
        TokenBudget::new(&TokenOptimizationConfig::default())
    }

    /// Shorthand constructor for a token estimate.
    fn usage(system_prompt: u32, summary: u32, history: u32, total: u32) -> ConversationTokenEstimate {
        ConversationTokenEstimate {
            system_prompt,
            summary,
            history,
            total,
        }
    }

    #[test]
    fn empty_conversation_no_compaction() {
        let alloc = default_budget().allocate(&usage(0, 0, 0, 0), false, 0);

        assert!(!alloc.requires_compaction);
        assert!(alloc.pressure < f32::EPSILON);
    }

    #[test]
    fn response_headroom_always_reserved() {
        let alloc = default_budget().allocate(&usage(100, 0, 6000, 6100), false, 0);

        // 8192 * 0.25 = 2048
        assert_eq!(alloc.response_headroom, 2048);
    }

    #[test]
    fn heavy_history_triggers_compaction() {
        // Available            = 8192 - 2048 = 6144
        // System prompt cap    = 6144 * 0.15 = 921, actual 100 → 100
        // History budget       = 6144 - 100  = 6044
        // Compaction threshold = 6044 * 0.70 = 4230
        // History actual 5000 > 4230 → compaction required
        let alloc = default_budget().allocate(&usage(100, 0, 5000, 5100), false, 0);

        assert!(alloc.requires_compaction);
    }

    #[test]
    fn tools_reduce_history_budget() {
        let budget = default_budget();
        let estimate = usage(100, 0, 2000, 2100);

        let bare = budget.allocate(&estimate, false, 0);
        let tooled = budget.allocate(&estimate, true, 5);

        assert!(tooled.history < bare.history);
        assert_eq!(tooled.tool_definitions, 5 * TOKENS_PER_TOOL_ESTIMATE);
    }

    #[test]
    fn pressure_increases_with_usage() {
        let budget = default_budget();

        let light = budget.allocate(&usage(50, 0, 200, 250), false, 0);
        let heavy = budget.allocate(&usage(200, 100, 5000, 5300), false, 0);

        assert!(heavy.pressure > light.pressure);
    }

    #[test]
    fn system_prompt_capped_at_budget_ratio() {
        // Available = 6144, cap = 6144 * 0.15 = 921; the prompt is far larger.
        let alloc = default_budget().allocate(&usage(2000, 0, 100, 2100), false, 0);

        // Capped at 921, with one token of slack for rounding.
        assert!(alloc.system_prompt <= 922);
        assert!(alloc.system_prompt < 2000);
    }

    #[test]
    fn pressure_priority_low_pressure_same_as_base() {
        let budget = default_budget();
        let estimate = usage(100, 0, 500, 600);

        let plain = budget.allocate(&estimate, true, 3);
        let prioritized = budget.allocate_with_pressure_priority(&estimate, true, 3);

        assert_eq!(plain, prioritized);
    }

    #[test]
    fn pressure_priority_system_prompt_protected() {
        // Usage plus ten tools far exceeds the 6144 available tokens, so the
        // high-pressure branch is taken.
        let estimate = usage(800, 200, 5000, 6000);

        let alloc = default_budget().allocate_with_pressure_priority(&estimate, true, 10);

        // The prompt fits under its cap and must not be trimmed.
        assert!(alloc.system_prompt >= 800);
        // History bears the squeeze.
        assert!(alloc.requires_compaction);
    }

    #[test]
    fn pressure_priority_tools_capped_under_pressure() {
        let budget = default_budget();
        let estimate = usage(500, 500, 5000, 6000);

        let plain = budget.allocate(&estimate, true, 20);
        let prioritized = budget.allocate_with_pressure_priority(&estimate, true, 20);

        // The pressure-aware path never grants tools more than the base path.
        assert!(prioritized.tool_definitions <= plain.tool_definitions);
    }

    // ========================================================================
    // allocate_adaptive tests
    // ========================================================================

    #[test]
    fn adaptive_no_rag_boosts_system_prompt_cap() {
        let budget = default_budget();
        // 1500 exceeds the default 15% cap (921) but fits the 30% cap (1843).
        let estimate = usage(1500, 0, 500, 2000);

        let with_rag = budget.allocate_adaptive(&estimate, true, false, 0);
        let without_rag = budget.allocate_adaptive(&estimate, false, false, 0);

        // Without RAG the cap becomes 15% + 15% = 30%, so more of the
        // system prompt survives.
        assert!(
            without_rag.system_prompt > with_rag.system_prompt,
            "Without RAG, system prompt cap should be higher: {} vs {}",
            without_rag.system_prompt,
            with_rag.system_prompt
        );
    }

    #[test]
    fn adaptive_no_rag_rag_context_is_zero() {
        let alloc = default_budget().allocate_adaptive(&usage(100, 0, 500, 600), false, false, 0);

        assert_eq!(alloc.rag_context, 0);
    }

    #[test]
    fn adaptive_with_rag_preserves_rag_budget() {
        let budget = default_budget();
        let estimate = usage(100, 0, 500, 600);

        let adaptive = budget.allocate_adaptive(&estimate, true, false, 0);
        let standard = budget.allocate(&estimate, false, 0);

        assert!(adaptive.rag_context > 0);
        // Same as the standard 15% cap.
        assert_eq!(adaptive.rag_context, standard.rag_context);
    }

    #[test]
    fn adaptive_no_rag_gives_more_history() {
        let budget = default_budget();
        let estimate = usage(100, 0, 3000, 3100);

        let with_rag = budget.allocate_adaptive(&estimate, true, false, 0);
        let without_rag = budget.allocate_adaptive(&estimate, false, false, 0);

        // The actual system prompt is below both caps (15% and 30%), so the
        // history budget (available - system_prompt) is identical either way.
        assert_eq!(without_rag.history, with_rag.history);
    }

    #[test]
    fn adaptive_matches_allocate_when_rag_present_no_tools() {
        let budget = default_budget();
        let estimate = usage(100, 0, 500, 600);

        let adaptive = budget.allocate_adaptive(&estimate, true, false, 0);
        let standard = budget.allocate(&estimate, false, 0);

        // With RAG present and no tools, the adaptive path matches the base path.
        assert_eq!(adaptive.system_prompt, standard.system_prompt);
        assert_eq!(adaptive.rag_context, standard.rag_context);
        assert_eq!(adaptive.history, standard.history);
    }
}