use crate::config::TokenOptimizationConfig;
use crate::estimator::ConversationTokenEstimate;
/// Token budget split produced by the [`TokenBudget`] allocation methods.
///
/// All counts are in tokens.
#[derive(Debug, Clone, PartialEq)]
pub struct BudgetAllocation {
/// Tokens granted to the system prompt: its estimated size, limited by a cap.
pub system_prompt: u32,
/// Tokens set aside for RAG context.
pub rag_context: u32,
/// Estimated tokens taken up by tool definitions.
pub tool_definitions: u32,
/// Tokens left for conversation history after the system prompt and tool definitions.
pub history: u32,
/// Tokens held back from the context window as response headroom.
pub response_headroom: u32,
/// Whether the estimated history plus summary exceeds the allocator's compaction limit.
pub requires_compaction: bool,
/// Estimated usage divided by the space left after headroom; never above `1.0`.
pub pressure: f32,
}
/// Flat per-tool token estimate; multiplied by the tool count to size tool definitions.
const TOKENS_PER_TOOL_ESTIMATE: u32 = 150;
/// Splits a context window into per-purpose token budgets.
///
/// The values are copied from a `TokenOptimizationConfig` by `TokenBudget::new`.
#[derive(Debug)]
pub struct TokenBudget {
/// Total context window size, in tokens.
context_window: u32,
/// Fraction of the context window reserved as response headroom.
response_headroom_ratio: f32,
/// Fraction of the history budget above which compaction is requested.
compaction_trigger_ratio: f32,
/// Fraction of the post-headroom space the system prompt may occupy.
system_prompt_budget_ratio: f32,
/// Fraction of the post-headroom space allotted to RAG context.
rag_budget_ratio: f32,
}
impl TokenBudget {
    /// Pressure at or above which [`Self::allocate_with_pressure_priority`] stops
    /// returning the plain allocation and applies its priority ordering instead.
    const HIGH_PRESSURE_THRESHOLD: f32 = 0.9;

    /// Builds a budget by copying the context window size and ratios out of `config`.
    #[must_use]
    pub fn new(config: &TokenOptimizationConfig) -> Self {
        Self {
            context_window: config.context_window_tokens,
            response_headroom_ratio: config.response_headroom_ratio,
            compaction_trigger_ratio: config.compaction_trigger_ratio,
            system_prompt_budget_ratio: config.system_prompt_budget_ratio,
            rag_budget_ratio: config.rag_budget_ratio,
        }
    }

    /// Computes the standard allocation for `estimate`.
    ///
    /// This is [`Self::allocate_adaptive`] with RAG treated as present: the system
    /// prompt is capped by its own ratio and the RAG share is always reserved.
    ///
    /// * `has_tools` - when `false`, no tokens are budgeted for tools regardless of
    ///   `tool_count`.
    /// * `tool_count` - number of tools; each is costed at `TOKENS_PER_TOOL_ESTIMATE`.
    #[must_use]
    pub fn allocate(
        &self,
        estimate: &ConversationTokenEstimate,
        has_tools: bool,
        tool_count: usize,
    ) -> BudgetAllocation {
        self.allocate_adaptive(estimate, true, has_tools, tool_count)
    }

    /// Like [`Self::allocate`], but re-balances tools and history once pressure is high.
    ///
    /// While pressure stays below the high-pressure threshold the plain allocation is
    /// returned unchanged. Otherwise the system prompt keeps its allocation, tool
    /// definitions are limited to at most half of the space left after the system
    /// prompt, and history receives the rest. In that mode compaction is requested
    /// only when history plus summary exceeds the whole history budget (the
    /// compaction trigger ratio is not applied).
    #[must_use]
    pub fn allocate_with_pressure_priority(
        &self,
        estimate: &ConversationTokenEstimate,
        has_tools: bool,
        tool_count: usize,
    ) -> BudgetAllocation {
        let base = self.allocate(estimate, has_tools, tool_count);
        if base.pressure < Self::HIGH_PRESSURE_THRESHOLD {
            return base;
        }

        // Headroom, system prompt, RAG share and pressure are the same as in the
        // plain allocation, so they are carried over from `base` instead of being
        // recomputed; `base.tool_definitions` is the uncapped tool estimate.
        let available = self.context_window.saturating_sub(base.response_headroom);
        let after_system = available.saturating_sub(base.system_prompt);
        let tool_definitions = base.tool_definitions.min(after_system / 2);
        let history = after_system.saturating_sub(tool_definitions);
        let history_actual = estimate.history.saturating_add(estimate.summary);

        BudgetAllocation {
            tool_definitions,
            history,
            requires_compaction: history_actual > history,
            ..base
        }
    }

    /// Computes an allocation that hands the RAG share to the system prompt when
    /// RAG is not in use.
    ///
    /// * `has_rag` - when `true`, the system prompt cap and RAG budget use their
    ///   configured ratios. When `false`, the RAG ratio is added to the system prompt
    ///   cap and `rag_context` is `0`.
    /// * `has_tools` / `tool_count` - see [`Self::allocate`].
    ///
    /// Token sums saturate at `u32::MAX` instead of overflowing.
    #[must_use]
    pub fn allocate_adaptive(
        &self,
        estimate: &ConversationTokenEstimate,
        has_rag: bool,
        has_tools: bool,
        tool_count: usize,
    ) -> BudgetAllocation {
        let response_headroom = Self::scale(self.context_window, self.response_headroom_ratio);
        let available = self.context_window.saturating_sub(response_headroom);

        let (system_ratio, rag_ratio) = if has_rag {
            (self.system_prompt_budget_ratio, self.rag_budget_ratio)
        } else {
            (
                self.system_prompt_budget_ratio + self.rag_budget_ratio,
                0.0_f32,
            )
        };

        let system_prompt = estimate
            .system_prompt
            .min(Self::scale(available, system_ratio));
        let rag_context = Self::scale(available, rag_ratio);
        let tool_definitions = Self::estimate_tool_tokens(has_tools, tool_count);

        // History gets whatever the system prompt and tools leave behind.
        let history =
            available.saturating_sub(system_prompt.saturating_add(tool_definitions));
        let history_actual = estimate.history.saturating_add(estimate.summary);
        let compaction_threshold = Self::scale(history, self.compaction_trigger_ratio);
        // The threshold is unsigned, so exceeding it already implies a non-empty history.
        let requires_compaction = history_actual > compaction_threshold;

        let total_used = estimate.total.saturating_add(tool_definitions);

        BudgetAllocation {
            system_prompt,
            rag_context,
            tool_definitions,
            history,
            response_headroom,
            requires_compaction,
            pressure: Self::pressure(total_used, available),
        }
    }

    /// Scales `tokens` by `ratio`, truncating toward zero.
    // The float-to-int `as` cast saturates (NaN and negative products become 0,
    // oversized products become `u32::MAX`), which is the clamping wanted here.
    #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)]
    fn scale(tokens: u32, ratio: f32) -> u32 {
        (f64::from(tokens) * f64::from(ratio)) as u32
    }

    /// Estimates the tokens needed for `tool_count` tool definitions, saturating at
    /// `u32::MAX`; returns `0` when `has_tools` is `false`.
    fn estimate_tool_tokens(has_tools: bool, tool_count: usize) -> u32 {
        if !has_tools {
            return 0;
        }
        // Fully qualified so the call does not rely on `TryFrom` being in the prelude.
        let count = <u32 as std::convert::TryFrom<usize>>::try_from(tool_count)
            .unwrap_or(u32::MAX);
        count.saturating_mul(TOKENS_PER_TOOL_ESTIMATE)
    }

    /// Returns `total_used / available` limited to at most `1.0`; a zero-sized
    /// budget counts as full pressure.
    fn pressure(total_used: u32, available: u32) -> f32 {
        if available == 0 {
            return 1.0;
        }
        // Narrowing to f32 only loses precision on a ratio that is clamped below anyway.
        #[allow(clippy::cast_possible_truncation)]
        let ratio = (f64::from(total_used) / f64::from(available)) as f32;
        ratio.min(1.0)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::TokenOptimizationConfig;

    /// Budget built from the default configuration.
    ///
    /// NOTE(review): the literal expectations below (2048, 922) presumably follow from
    /// `TokenOptimizationConfig::default()`; confirm if the defaults change.
    fn default_budget() -> TokenBudget {
        TokenBudget::new(&TokenOptimizationConfig::default())
    }

    /// Builds an estimate whose `total` is the sum of its three parts.
    fn estimate(system_prompt: u32, summary: u32, history: u32) -> ConversationTokenEstimate {
        ConversationTokenEstimate {
            system_prompt,
            summary,
            history,
            total: system_prompt + summary + history,
        }
    }

    #[test]
    fn empty_conversation_no_compaction() {
        let budget = default_budget();
        let alloc = budget.allocate(&estimate(0, 0, 0), false, 0);
        assert!(!alloc.requires_compaction);
        assert!(alloc.pressure < f32::EPSILON);
    }

    #[test]
    fn response_headroom_always_reserved() {
        let budget = default_budget();
        let alloc = budget.allocate(&estimate(100, 0, 6000), false, 0);
        assert_eq!(alloc.response_headroom, 2048);
    }

    #[test]
    fn heavy_history_triggers_compaction() {
        let budget = default_budget();
        let alloc = budget.allocate(&estimate(100, 0, 5000), false, 0);
        assert!(alloc.requires_compaction);
    }

    #[test]
    fn tools_reduce_history_budget() {
        let budget = default_budget();
        let est = estimate(100, 0, 2000);
        let without_tools = budget.allocate(&est, false, 0);
        let with_tools = budget.allocate(&est, true, 5);
        assert!(with_tools.history < without_tools.history);
        assert_eq!(with_tools.tool_definitions, 5 * TOKENS_PER_TOOL_ESTIMATE);
    }

    #[test]
    fn pressure_increases_with_usage() {
        let budget = default_budget();
        let low_alloc = budget.allocate(&estimate(50, 0, 200), false, 0);
        let high_alloc = budget.allocate(&estimate(200, 100, 5000), false, 0);
        assert!(high_alloc.pressure > low_alloc.pressure);
    }

    #[test]
    fn system_prompt_capped_at_budget_ratio() {
        let budget = default_budget();
        // A 2000-token prompt is expected to exceed the cap and be clipped.
        let alloc = budget.allocate(&estimate(2000, 0, 100), false, 0);
        assert!(alloc.system_prompt <= 922);
        assert!(alloc.system_prompt < 2000);
    }

    #[test]
    fn pressure_priority_low_pressure_same_as_base() {
        let budget = default_budget();
        let est = estimate(100, 0, 500);
        let base = budget.allocate(&est, true, 3);
        let priority = budget.allocate_with_pressure_priority(&est, true, 3);
        assert_eq!(base, priority);
    }

    #[test]
    fn pressure_priority_system_prompt_protected() {
        let budget = default_budget();
        let alloc = budget.allocate_with_pressure_priority(&estimate(800, 200, 5000), true, 10);
        assert!(alloc.system_prompt >= 800);
        assert!(alloc.requires_compaction);
    }

    #[test]
    fn pressure_priority_tools_capped_under_pressure() {
        let budget = default_budget();
        let est = estimate(500, 500, 5000);
        let base = budget.allocate(&est, true, 20);
        let priority = budget.allocate_with_pressure_priority(&est, true, 20);
        assert!(priority.tool_definitions <= base.tool_definitions);
    }

    #[test]
    fn adaptive_no_rag_boosts_system_prompt_cap() {
        let budget = default_budget();
        // Large enough that the prompt is expected to hit the cap when RAG is present.
        let est = estimate(1500, 0, 500);
        let with_rag = budget.allocate_adaptive(&est, true, false, 0);
        let without_rag = budget.allocate_adaptive(&est, false, false, 0);
        assert!(
            without_rag.system_prompt > with_rag.system_prompt,
            "Without RAG, system prompt cap should be higher: {} vs {}",
            without_rag.system_prompt,
            with_rag.system_prompt
        );
    }

    #[test]
    fn adaptive_no_rag_rag_context_is_zero() {
        let budget = default_budget();
        let alloc = budget.allocate_adaptive(&estimate(100, 0, 500), false, false, 0);
        assert_eq!(alloc.rag_context, 0);
    }

    #[test]
    fn adaptive_with_rag_preserves_rag_budget() {
        let budget = default_budget();
        let est = estimate(100, 0, 500);
        let alloc = budget.allocate_adaptive(&est, true, false, 0);
        assert!(alloc.rag_context > 0);
        assert_eq!(
            alloc.rag_context,
            budget.allocate(&est, false, 0).rag_context
        );
    }

    /// The RAG share is never subtracted from the history budget, so dropping RAG
    /// only affects history through the system prompt cap. A small prompt is
    /// expected to fit under the cap in both modes, leaving history unchanged.
    #[test]
    fn adaptive_no_rag_keeps_history_when_prompt_under_cap() {
        let budget = default_budget();
        let est = estimate(100, 0, 3000);
        let with_rag = budget.allocate_adaptive(&est, true, false, 0);
        let without_rag = budget.allocate_adaptive(&est, false, false, 0);
        assert_eq!(without_rag.history, with_rag.history);
    }

    #[test]
    fn adaptive_matches_allocate_when_rag_present_no_tools() {
        let budget = default_budget();
        let est = estimate(100, 0, 500);
        let adaptive = budget.allocate_adaptive(&est, true, false, 0);
        let base = budget.allocate(&est, false, 0);
        assert_eq!(adaptive.system_prompt, base.system_prompt);
        assert_eq!(adaptive.rag_context, base.rag_context);
        assert_eq!(adaptive.history, base.history);
    }
}