use tracing::{debug, info, instrument, warn};
use crate::ports::SummarizationPort;
use crate::types::{Conversation, ToolDefinition};
use crate::budget::{BudgetAllocation, TokenBudget};
use crate::config::TokenOptimizationConfig;
use crate::error::TokenOptError;
use crate::estimator::{ConversationTokenEstimate, TokenEstimator};
use crate::history::compactor::{CompactionResult, HistoryCompactor};
use crate::prompt::system_prompt::{PromptContext, inject_conciseness, optimize_system_prompt};
use crate::stream::repetition::RepetitionDetector;
use crate::tools::schema_compressor::compress_tool_definitions;
use crate::tools::selector::select_tools;
/// A single entry of an [`OptimizationPlan`]: one optimization pass together
/// with the number of tokens it is estimated to save.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct OptimizationStep {
/// Static identifier of the pass (for example `"system_prompt_trim"`).
pub name: &'static str,
/// Estimated number of tokens the pass would remove.
pub estimated_savings: u32,
}
/// Collection of candidate optimization steps with their estimated savings.
///
/// Plans produced by `OptimizationPlan::build` are sorted by descending
/// `estimated_savings`; a plan constructed by hand carries whatever order the
/// caller supplied.
#[derive(Debug, Default)]
pub struct OptimizationPlan {
/// Candidate steps; only steps with non-zero savings are added by `build`.
pub steps: Vec<OptimizationStep>,
}
impl OptimizationPlan {
#[must_use]
pub fn build(
conversation: &crate::types::Conversation,
allocation: &BudgetAllocation,
tool_result_max_tokens: u32,
) -> Self {
let mut steps = Vec::new();
let total_msgs = conversation.messages.len();
let decay_cutoff = total_msgs.saturating_sub(10); let rag_savings: u32 = conversation.messages[..decay_cutoff]
.iter()
.filter(|m| {
m.content.contains("YOUR MEMORY:") || m.content.contains("YOUR KNOWLEDGE GRAPH:")
})
.map(|m| TokenEstimator::estimate_tokens(&m.content))
.sum();
if rag_savings > 0 {
steps.push(OptimizationStep {
name: "rag_cross_turn_dedup",
estimated_savings: rag_savings,
});
}
let last_user_idx = conversation
.messages
.iter()
.rposition(|m| m.role == crate::types::MessageRole::User)
.unwrap_or(conversation.messages.len());
let tool_savings: u32 = conversation.messages[..last_user_idx]
.iter()
.filter(|m| m.role == crate::types::MessageRole::Tool)
.map(|m| {
let tokens = TokenEstimator::estimate_tokens(&m.content);
tokens.saturating_sub(tool_result_max_tokens.min(tokens))
})
.sum();
if tool_savings > 0 {
steps.push(OptimizationStep {
name: "tool_result_compression",
estimated_savings: tool_savings,
});
}
if let Some(ref prompt) = conversation.system_prompt {
let prompt_tokens = TokenEstimator::estimate_tokens(prompt);
let prompt_savings = prompt_tokens.saturating_sub(allocation.system_prompt);
if prompt_savings > 0 {
steps.push(OptimizationStep {
name: "system_prompt_trim",
estimated_savings: prompt_savings,
});
}
}
steps.sort_by(|a, b| b.estimated_savings.cmp(&a.estimated_savings));
Self { steps }
}
#[must_use]
pub fn total_estimated_savings(&self) -> u32 {
self.steps.iter().map(|s| s.estimated_savings).sum()
}
}
/// Outcome of a conversation optimization run, as returned by
/// `TokenOptimizer::optimize_conversation` and
/// `TokenOptimizer::optimize_conversation_with_tools`.
#[derive(Debug)]
pub struct OptimizationResult {
/// Budget allocation computed after the lightweight passes; it drives
/// compaction and system-prompt trimming.
pub budget: BudgetAllocation,
/// Token estimate taken before any modification.
pub estimate_before: ConversationTokenEstimate,
/// Token estimate taken after all passes ran.
pub estimate_after: ConversationTokenEstimate,
/// Compaction outcome; `None` when compaction was not required or failed.
pub compaction: Option<CompactionResult>,
/// Whether the system prompt was rewritten to fit its allocation.
pub system_prompt_trimmed: bool,
/// Output token cap, copied from `config.output_max_tokens`.
pub recommended_max_tokens: Option<u32>,
/// Plan built from the conversation before it was modified.
pub plan: OptimizationPlan,
}
/// Orchestrates token-budget optimization of conversations, tool
/// definitions and single prompts.
#[derive(Debug)]
pub struct TokenOptimizer {
/// Configuration the optimizer was created with.
config: TokenOptimizationConfig,
/// Budget allocator, built from `config` in `new`.
budget: TokenBudget,
/// History compactor, built with `config.max_summary_tokens` in `new`.
compactor: HistoryCompactor,
/// Optional calibrator fed by `report_actual_tokens`; set by `with_calibration`.
calibrator: Option<std::sync::RwLock<crate::estimator_tuning::EstimationCalibrator>>,
/// Optional metrics sink; set by `with_metrics`.
metrics: Option<std::sync::Arc<crate::metrics::OptimizationMetrics>>,
/// Optional HuggingFace tokenizer; when present it replaces the heuristic
/// `TokenEstimator` in the private `estimate_*` helpers.
#[cfg(feature = "hf-tokenizer")]
hf_estimator: Option<std::sync::Arc<crate::estimator_hf::HfTokenEstimator>>,
}
impl TokenOptimizer {
/// Creates an optimizer from `config`.
///
/// The budget allocator and history compactor are derived from `config`.
/// Calibration and metrics start disabled. With the `hf-tokenizer` feature
/// the tokenizer named by the configuration is initialized if possible.
#[must_use]
pub fn new(config: TokenOptimizationConfig) -> Self {
    // Field initializers run top to bottom, so every borrow of `config`
    // finishes before it is moved into the struct in the last field.
    Self {
        budget: TokenBudget::new(&config),
        compactor: HistoryCompactor::new(config.max_summary_tokens),
        calibrator: None,
        metrics: None,
        #[cfg(feature = "hf-tokenizer")]
        hf_estimator: init_hf_estimator(&config),
        config,
    }
}
/// Enables estimation calibration by installing a fresh calibrator.
///
/// Observations are subsequently fed in through `report_actual_tokens`.
#[must_use]
pub fn with_calibration(mut self) -> Self {
    let calibrator = crate::estimator_tuning::EstimationCalibrator::new();
    self.calibrator = Some(std::sync::RwLock::new(calibrator));
    self
}
/// Attaches a shared metrics sink, replacing any previously attached one.
#[must_use]
pub fn with_metrics(
    mut self,
    metrics: std::sync::Arc<crate::metrics::OptimizationMetrics>,
) -> Self {
    // The previous sink (if any) is dropped here.
    let _ = self.metrics.replace(metrics);
    self
}
/// Installs an explicit HuggingFace tokenizer, replacing whatever `new`
/// initialized from the configuration.
#[cfg(feature = "hf-tokenizer")]
#[must_use]
pub fn with_hf_tokenizer(mut self, estimator: crate::estimator_hf::HfTokenEstimator) -> Self {
    let shared = std::sync::Arc::new(estimator);
    self.hf_estimator = Some(shared);
    self
}
/// Estimates the token usage of a whole conversation.
///
/// Uses the HuggingFace tokenizer when the `hf-tokenizer` feature is on and
/// an estimator is installed; otherwise falls back to `TokenEstimator`.
fn estimate_conversation(&self, conversation: &Conversation) -> ConversationTokenEstimate {
    #[cfg(feature = "hf-tokenizer")]
    {
        if let Some(hf) = self.hf_estimator.as_deref() {
            return hf.count_conversation_tokens(conversation);
        }
    }
    TokenEstimator::estimate_conversation(conversation)
}
/// Estimates the token count of `text`.
///
/// Uses the HuggingFace tokenizer when the `hf-tokenizer` feature is on and
/// an estimator is installed; otherwise falls back to `TokenEstimator`.
fn estimate_tokens(&self, text: &str) -> u32 {
    #[cfg(feature = "hf-tokenizer")]
    {
        if let Some(hf) = self.hf_estimator.as_deref() {
            return hf.count_tokens(text);
        }
    }
    TokenEstimator::estimate_tokens(text)
}
/// Estimates the token cost of a set of tool definitions.
///
/// Uses the HuggingFace tokenizer when the `hf-tokenizer` feature is on and
/// an estimator is installed; otherwise falls back to `TokenEstimator`.
fn estimate_tool_definitions(&self, tools: &[ToolDefinition]) -> u32 {
    #[cfg(feature = "hf-tokenizer")]
    {
        if let Some(hf) = self.hf_estimator.as_deref() {
            return hf.count_tool_definitions_tokens(tools);
        }
    }
    TokenEstimator::estimate_tool_definitions(tools)
}
/// Clamps `tokens` to `config.output_max_tokens` when a cap is configured;
/// returns `tokens` unchanged otherwise. Test-only helper.
#[cfg(test)]
fn cap_output_tokens(&self, tokens: u32) -> u32 {
    match self.config.output_max_tokens {
        Some(cap) => tokens.min(cap),
        None => tokens,
    }
}
pub fn report_actual_tokens(&self, model: &str, estimated: u32, actual: u32) {
if let Some(ref cal) = self.calibrator {
if let Ok(mut guard) = cal.write() {
guard.record_observation(model, estimated, actual);
}
}
}
#[must_use]
pub fn is_enabled(&self) -> bool {
self.config.enabled
}
#[must_use]
pub fn config(&self) -> &TokenOptimizationConfig {
&self.config
}
/// Runs the optimization pipeline on `conversation`, mutating it in place.
///
/// Steps, in order:
/// 1. Estimate tokens and build a diagnostic [`OptimizationPlan`].
/// 2. Apply cross-turn RAG deduplication and old tool-result compression.
/// 3. Re-estimate, re-allocate the budget and compact the history when the
///    allocation sets `requires_compaction`.
/// 4. Rewrite the system prompt when its token count exceeds
///    `allocation.system_prompt`, then pass it through `inject_conciseness`.
/// 5. Re-estimate, log the reduction and record metrics if a sink is attached.
///
/// `inference` is forwarded unchanged to the history compactor.
///
/// Note: `self.config.enabled` is recorded as a tracing field but is not
/// checked here; callers that want to honour it must test `is_enabled()`.
///
/// # Errors
///
/// No path visible in this function returns `Err`: a compaction failure is
/// logged at `warn` level and the run continues without compaction.
#[allow(clippy::too_many_lines)]
#[instrument(skip(self, conversation, inference), fields(
msgs = conversation.messages.len(),
enabled = self.config.enabled,
))]
pub async fn optimize_conversation(
&self,
conversation: &mut Conversation,
inference: Option<&dyn SummarizationPort>,
) -> Result<OptimizationResult, TokenOptError> {
let estimate_before = self.estimate_conversation(conversation);
debug!(
system_prompt = estimate_before.system_prompt,
summary = estimate_before.summary,
history = estimate_before.history,
total = estimate_before.total,
"Token estimate before optimization"
);
// Preliminary allocation: only used to build the plan below; it is
// recomputed (and shadowed) after the lightweight passes.
// NOTE(review): the meaning of the `false, false, 0` arguments is defined
// by `TokenBudget::allocate_adaptive` — confirm there.
let allocation = self
.budget
.allocate_adaptive(&estimate_before, false, false, 0);
// The plan is logged and returned to the caller; the passes below run
// regardless of its contents.
let plan = OptimizationPlan::build(
conversation,
&allocation,
self.config.tool_result_max_tokens,
);
if !plan.steps.is_empty() {
debug!(
steps = ?plan.steps.iter().map(|s| format!("{}(~{}t)", s.name, s.estimated_savings)).collect::<Vec<_>>(),
total_estimated_savings = plan.total_estimated_savings(),
"Impact-ordered optimization plan"
);
}
// Turn index is taken as half the message count.
// NOTE(review): presumably assumes alternating user/assistant messages — confirm.
let current_turn = conversation.messages.len() / 2;
let rag_dedup_saved = crate::prompt::rag_cross_turn_dedup::deduplicate_rag_across_turns(
&mut conversation.messages,
current_turn,
None,
);
if rag_dedup_saved > 0 {
debug!(
tokens_saved = rag_dedup_saved,
"Cross-turn RAG dedup applied"
);
}
let tool_result_saved = crate::tools::result_truncator::compress_old_tool_results(
&mut conversation.messages,
self.config.tool_result_max_tokens,
);
if tool_result_saved > 0 {
debug!(
tokens_saved = tool_result_saved,
"Old tool results compressed"
);
}
// Re-estimate after the lightweight passes so compaction and prompt
// trimming are decided on the already reduced conversation.
let estimate_after_lightweight = self.estimate_conversation(conversation);
let allocation =
self.budget
.allocate_adaptive(&estimate_after_lightweight, false, false, 0);
let compaction = if allocation.requires_compaction {
debug!(
history_budget = allocation.history,
history_actual =
estimate_after_lightweight.history + estimate_after_lightweight.summary,
pressure = allocation.pressure,
"History compaction triggered"
);
// Compaction is best-effort: an error is logged and mapped to `None`.
match self
.compactor
.compact(conversation, &allocation, inference)
.await
{
Ok(result) => {
info!(
strategy = ?result.strategy,
messages_removed = result.messages_removed,
tokens_saved = result.tokens_saved,
"History compacted"
);
Some(result)
},
Err(e) => {
warn!(error = %e, "History compaction failed, continuing without");
None
},
}
} else {
None
};
let system_prompt_trimmed = if let Some(ref prompt) = conversation.system_prompt {
// Without the `pisovereign` feature the cached prompt token count is
// preferred over re-estimating; with it, the prompt is always re-estimated.
#[cfg(not(feature = "pisovereign"))]
let prompt_tokens = conversation
.cached_prompt_tokens
.unwrap_or_else(|| self.estimate_tokens(prompt));
#[cfg(feature = "pisovereign")]
let prompt_tokens = self.estimate_tokens(prompt);
if prompt_tokens > allocation.system_prompt {
// Second flag is derived from a plain substring search for "Context:"/"context:".
// NOTE(review): the first flag is `false` here and `true` in the
// tool-calling variant — presumably "has tools"; confirm in `PromptContext::new`.
let context = PromptContext::new(
false,
prompt.contains("Context:") || prompt.contains("context:"),
);
let optimized = optimize_system_prompt(prompt, allocation.system_prompt, &context);
#[cfg(not(feature = "pisovereign"))]
{
conversation.cached_prompt_tokens = Some(self.estimate_tokens(&optimized));
}
conversation.system_prompt = Some(optimized);
true
} else {
// Populate the cache on first sight so later calls skip estimation.
#[cfg(not(feature = "pisovereign"))]
if conversation.cached_prompt_tokens.is_none() {
conversation.cached_prompt_tokens = Some(prompt_tokens);
}
false
}
} else {
false
};
// `inject_conciseness` is only attempted when the prompt was trimmed in this call.
// NOTE(review): `cached_prompt_tokens` is not refreshed after the prompt is
// replaced below, so the cache reflects the trimmed prompt without the
// injected text — confirm this is intended.
if system_prompt_trimmed {
if let Some(ref prompt) = conversation.system_prompt {
let concise = inject_conciseness(
prompt,
f64::from(allocation.pressure),
self.config.conciseness_pressure_threshold,
);
// A differing byte length is used as the "prompt was modified" signal.
if concise.len() != prompt.len() {
conversation.system_prompt = Some(concise);
}
}
}
let estimate_after = self.estimate_conversation(conversation);
// Guard against division by zero for an empty conversation.
if estimate_before.total > 0 {
let saved = estimate_before.total.saturating_sub(estimate_after.total);
let reduction_pct = (f64::from(saved) / f64::from(estimate_before.total)) * 100.0;
info!(
before = estimate_before.total,
after = estimate_after.total,
saved,
reduction_pct = format_args!("{reduction_pct:.1}%"),
"Conversation optimized"
);
}
// The output cap is passed through from the configuration unchanged.
let recommended_max_tokens: Option<u32> = self.config.output_max_tokens;
if let Some(ref metrics) = self.metrics {
// Map the compaction strategy to the label recorded in metrics;
// "none" is also used when no compaction result exists.
let strategy_name = compaction.as_ref().map_or("none", |c| match c.strategy {
crate::history::compactor::CompactionStrategy::None => "none",
crate::history::compactor::CompactionStrategy::Deduplication => "dedup",
crate::history::compactor::CompactionStrategy::Lossless => "lossless",
crate::history::compactor::CompactionStrategy::Extractive => "extractive",
crate::history::compactor::CompactionStrategy::Paraphrasing => "paraphrasing",
crate::history::compactor::CompactionStrategy::LlmFallback => "llm",
});
metrics.record_optimization(estimate_before.total, estimate_after.total, strategy_name);
if recommended_max_tokens.is_some() {
metrics.record_output_cap();
}
}
Ok(OptimizationResult {
budget: allocation,
estimate_before,
estimate_after,
compaction,
system_prompt_trimmed,
recommended_max_tokens,
plan,
})
}
/// Tool-calling variant of the conversation optimization pipeline; mutates
/// `conversation` in place.
///
/// Runs the same sequence as `optimize_conversation` (plan, RAG dedup, tool
/// result compression, optional compaction, system prompt trimming and
/// `inject_conciseness`), but budgets are allocated with
/// `allocate_adaptive(.., false, true, tools.len())` and the prompt context is
/// created with its first flag set to `true`.
///
/// Only the number of `tools` is used here; the definitions themselves are
/// not read or modified.
///
/// # Errors
///
/// No path visible in this function returns `Err`: a compaction failure is
/// logged at `warn` level and the run continues without compaction.
#[allow(clippy::too_many_lines)]
#[instrument(skip(self, conversation, tools, inference), fields(
msgs = conversation.messages.len(),
tools = tools.len(),
))]
pub async fn optimize_conversation_with_tools(
&self,
conversation: &mut Conversation,
tools: &[ToolDefinition],
inference: Option<&dyn SummarizationPort>,
) -> Result<OptimizationResult, TokenOptError> {
let estimate_before = self.estimate_conversation(conversation);
// Preliminary allocation: only used to build the plan; recomputed below.
let allocation = self
.budget
.allocate_adaptive(&estimate_before, false, true, tools.len());
let plan = OptimizationPlan::build(
conversation,
&allocation,
self.config.tool_result_max_tokens,
);
// Turn index is taken as half the message count.
// NOTE(review): presumably assumes alternating user/assistant messages — confirm.
let current_turn = conversation.messages.len() / 2;
// The token savings returned by the two lightweight passes are discarded here.
crate::prompt::rag_cross_turn_dedup::deduplicate_rag_across_turns(
&mut conversation.messages,
current_turn,
None,
);
crate::tools::result_truncator::compress_old_tool_results(
&mut conversation.messages,
self.config.tool_result_max_tokens,
);
// Re-estimate after the lightweight passes before deciding on compaction.
let estimate_after_lightweight = self.estimate_conversation(conversation);
let allocation =
self.budget
.allocate_adaptive(&estimate_after_lightweight, false, true, tools.len());
let compaction = if allocation.requires_compaction {
// Compaction is best-effort: an error is logged and mapped to `None`.
match self
.compactor
.compact(conversation, &allocation, inference)
.await
{
Ok(result) => {
info!(
strategy = ?result.strategy,
messages_removed = result.messages_removed,
tokens_saved = result.tokens_saved,
"History compacted (tool-calling context)"
);
Some(result)
},
Err(e) => {
warn!(error = %e, "History compaction failed, continuing without");
None
},
}
} else {
None
};
let system_prompt_trimmed = if let Some(ref prompt) = conversation.system_prompt {
// Without the `pisovereign` feature the cached prompt token count is
// preferred over re-estimating; with it, the prompt is always re-estimated.
#[cfg(not(feature = "pisovereign"))]
let prompt_tokens = conversation
.cached_prompt_tokens
.unwrap_or_else(|| self.estimate_tokens(prompt));
#[cfg(feature = "pisovereign")]
let prompt_tokens = self.estimate_tokens(prompt);
if prompt_tokens > allocation.system_prompt {
// Second flag is derived from a plain substring search for "Context:"/"context:".
// NOTE(review): first flag is `true` here — presumably "has tools";
// confirm in `PromptContext::new`.
let context = PromptContext::new(
true,
prompt.contains("Context:") || prompt.contains("context:"),
);
let optimized = optimize_system_prompt(prompt, allocation.system_prompt, &context);
#[cfg(not(feature = "pisovereign"))]
{
conversation.cached_prompt_tokens = Some(self.estimate_tokens(&optimized));
}
conversation.system_prompt = Some(optimized);
true
} else {
// Populate the cache on first sight so later calls skip estimation.
#[cfg(not(feature = "pisovereign"))]
if conversation.cached_prompt_tokens.is_none() {
conversation.cached_prompt_tokens = Some(prompt_tokens);
}
false
}
} else {
false
};
// `inject_conciseness` is only attempted when the prompt was trimmed in this call.
// NOTE(review): `cached_prompt_tokens` is not refreshed after the prompt is
// replaced below — confirm this is intended.
if system_prompt_trimmed {
if let Some(ref prompt) = conversation.system_prompt {
let concise = inject_conciseness(
prompt,
f64::from(allocation.pressure),
self.config.conciseness_pressure_threshold,
);
// A differing byte length is used as the "prompt was modified" signal.
if concise.len() != prompt.len() {
conversation.system_prompt = Some(concise);
}
}
}
let estimate_after = self.estimate_conversation(conversation);
// The output cap is passed through from the configuration unchanged.
let recommended_max_tokens: Option<u32> = self.config.output_max_tokens;
// Metrics always use the fixed label "tools", whatever compaction strategy
// ran, and `record_output_cap` is not called on this path.
// NOTE(review): confirm both are intentional.
if let Some(ref metrics) = self.metrics {
metrics.record_optimization(estimate_before.total, estimate_after.total, "tools");
}
Ok(OptimizationResult {
budget: allocation,
estimate_before,
estimate_after,
compaction,
system_prompt_trimmed,
recommended_max_tokens,
plan,
})
}
/// Selects the tools relevant to `query` and compresses their definitions.
///
/// At most `config.max_tools_per_request` is passed to `select_tools` as the
/// selection limit; the selected definitions are then run through
/// `compress_tool_definitions`. Returns an empty vector for empty input.
#[must_use]
pub fn optimize_tools(&self, query: &str, tools: &[ToolDefinition]) -> Vec<ToolDefinition> {
    if tools.is_empty() {
        return Vec::new();
    }
    let selected = select_tools(query, tools, self.config.max_tools_per_request);
    let compressed = compress_tool_definitions(&selected);
    // The token estimates exist only for this diagnostic event. Keeping them
    // inside the macro means the three estimation passes (tokenizer runs when
    // an HF estimator is installed) happen only when the event is enabled,
    // since `tracing` evaluates field expressions lazily.
    debug!(
        before_count = tools.len(),
        after_count = selected.len(),
        before_tokens = self.estimate_tool_definitions(tools),
        after_tokens = self.estimate_tool_definitions(&selected),
        compressed_tokens = self.estimate_tool_definitions(&compressed),
        "Tools optimized"
    );
    compressed
}
/// Runs `optimize_tools` and then applies progressive compression driven by
/// the supplied usage `tracker`.
#[must_use]
pub fn optimize_tools_progressive(
    &self,
    query: &str,
    tools: &[ToolDefinition],
    tracker: &crate::tools::progressive::ToolUsageTracker,
) -> Vec<ToolDefinition> {
    use crate::tools::progressive::compress_progressively;

    let selected_and_compressed = self.optimize_tools(query, tools);
    compress_progressively(&selected_and_compressed, tracker)
}
/// Optimizes a single prompt by wrapping it in a one-message conversation
/// and running `optimize_conversation` on it.
///
/// The returned text is the content of the first message after
/// optimization (empty if no message remains). `reduction_ratio` is the
/// fraction of estimated tokens removed, `0.0` when the input is empty or
/// when the text did not shrink.
///
/// # Errors
///
/// Propagates any error returned by `optimize_conversation`.
pub async fn optimize_prompt(
    &self,
    text: &str,
    inference: Option<&dyn SummarizationPort>,
) -> Result<crate::types::OptimizedPrompt, TokenOptError> {
    let tokens_before = self.estimate_tokens(text);
    let mut conv = Conversation::new();
    conv.add_user_message(text);
    let result = self.optimize_conversation(&mut conv, inference).await?;
    let optimized_text = conv
        .messages
        .first()
        .map_or_else(String::new, |m| m.content.clone());
    let tokens_after = self.estimate_tokens(&optimized_text);
    let complexity = crate::output::complexity::classify_query(text);
    // Saturating subtraction: a plain `u32` subtraction would panic in debug
    // builds (and wrap to a huge ratio in release) if the optimized text is
    // estimated larger than the input. This mirrors how `optimize_conversation`
    // computes its own savings.
    #[allow(clippy::cast_precision_loss)]
    let reduction_ratio = if tokens_before > 0 {
        tokens_before.saturating_sub(tokens_after) as f32 / tokens_before as f32
    } else {
        0.0
    };
    let mut strategies = Vec::new();
    if result.system_prompt_trimmed {
        strategies.push("system_prompt_trim".to_string());
    }
    if result.compaction.is_some() {
        strategies.push("history_compaction".to_string());
    }
    Ok(crate::types::OptimizedPrompt {
        text: optimized_text,
        recommended_max_tokens: self.config.output_max_tokens,
        tokens_estimated: tokens_after,
        strategies_applied: strategies,
        metadata: crate::types::OptimizationMetadata {
            tokens_before,
            tokens_after,
            reduction_ratio,
            complexity: Some(complexity),
        },
    })
}
/// Creates a repetition detector for monitoring a response stream.
///
/// Returns `None` when `config.repetition_detection_enabled` is off;
/// otherwise the detector is built from the configured n-gram size and
/// threshold.
#[must_use]
pub fn create_stream_monitor(&self) -> Option<RepetitionDetector> {
    let cfg = &self.config;
    cfg.repetition_detection_enabled.then(|| {
        RepetitionDetector::new(cfg.repetition_ngram_size, cfg.repetition_threshold)
    })
}
}
/// Tries to build a HuggingFace token estimator from `config.tokenizer_model`.
///
/// Returns `None` when no model is configured. If the configured value is an
/// existing filesystem path it is loaded as a local tokenizer file first; when
/// that fails, or the path does not exist, the value is passed to
/// `HfTokenEstimator::from_pretrained`. Every failure is logged at `warn`
/// level, and a final failure yields `None` so heuristic estimation is used.
#[cfg(feature = "hf-tokenizer")]
fn init_hf_estimator(
    config: &TokenOptimizationConfig,
) -> Option<std::sync::Arc<crate::estimator_hf::HfTokenEstimator>> {
    use crate::estimator_hf::HfTokenEstimator;
    use std::sync::Arc;

    let model = config.tokenizer_model.as_deref()?;

    // Prefer a tokenizer file on disk when the configured value is a real path.
    let local_path = std::path::Path::new(model);
    if local_path.exists() {
        match HfTokenEstimator::from_file(local_path) {
            Ok(estimator) => {
                info!(path = model, "Using local HuggingFace tokenizer");
                return Some(Arc::new(estimator));
            },
            Err(e) => warn!(error = %e, path = model, "Failed to load local tokenizer"),
        }
    }

    // Fall back to resolving the value as a pretrained model identifier.
    HfTokenEstimator::from_pretrained(model)
        .map(|estimator| {
            info!(model, "Initialized HuggingFace tokenizer from Hub");
            Arc::new(estimator)
        })
        .map_err(|e| {
            warn!(
                error = %e,
                model,
                "Failed to initialize HuggingFace tokenizer, using heuristic estimation"
            );
        })
        .ok()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline configuration shared by the tests: the crate defaults.
    fn default_config() -> TokenOptimizationConfig {
        TokenOptimizationConfig::default()
    }

    /// Optimizer built from the default configuration.
    fn default_optimizer() -> TokenOptimizer {
        TokenOptimizer::new(default_config())
    }

    #[test]
    fn optimizer_creates_with_defaults() {
        assert!(default_optimizer().is_enabled());
    }

    #[test]
    fn disabled_optimizer_reports_disabled() {
        let mut cfg = default_config();
        cfg.enabled = false;
        assert!(!TokenOptimizer::new(cfg).is_enabled());
    }

    #[tokio::test]
    async fn optimize_small_conversation_no_compaction() {
        let optimizer = default_optimizer();
        let mut conv = Conversation::new();
        conv.add_user_message("Hello");
        conv.add_assistant_message("Hi there!");

        let outcome = optimizer.optimize_conversation(&mut conv, None).await;
        assert!(outcome.is_ok());
        let report = outcome.expect("optimization should succeed");
        assert!(report.compaction.is_none());
        assert!(!report.system_prompt_trimmed);
    }

    #[tokio::test]
    async fn optimize_large_conversation_triggers_compaction() {
        let optimizer = default_optimizer();
        let mut conv = Conversation::with_system_prompt("You are a helpful assistant.");
        // 60 verbose exchanges, intended to push the history over the compaction trigger.
        for i in 0..60 {
            conv.add_user_message(format!(
                "Message number {i} with a significant amount of content designed to take \
                 up token space and push us over the budget threshold. This message includes \
                 additional context about the topic, references to previous conversations, \
                 and detailed questions that require substantial processing. Furthermore, \
                 each message contributes to the overall token count which should eventually \
                 exceed the configured compaction trigger ratio of the context window."
            ));
            conv.add_assistant_message(format!(
                "Response number {i} with a comprehensive and detailed answer about the \
                 topic including various facts, figures, explanations, and recommendations. \
                 This response is intentionally verbose to ensure that the conversation \
                 history grows large enough to trigger the compaction mechanism. It covers \
                 multiple aspects of the question and provides thorough analysis with \
                 supporting evidence, examples, and actionable next steps for the user."
            ));
        }

        let outcome = optimizer.optimize_conversation(&mut conv, None).await;
        assert!(outcome.is_ok());
        let report = outcome.expect("optimization should succeed");
        assert!(report.compaction.is_some());
        assert!(report.estimate_after.total < report.estimate_before.total);
    }

    #[test]
    fn optimize_tools_reduces_count() {
        let optimizer = default_optimizer();
        let mut tools = Vec::with_capacity(15);
        for i in 0..15 {
            tools.push(ToolDefinition {
                name: format!("tool_{i}"),
                description: format!("This tool does something related to task {i}"),
                parameters: crate::types::ToolParameters {
                    schema_type: "object".to_string(),
                    properties: std::collections::HashMap::new(),
                    required: Vec::new(),
                },
                icon: None,
            });
        }

        let selected = optimizer.optimize_tools("I need tool_1 for my task", &tools);
        assert!(selected.len() <= 8);
    }

    #[test]
    fn create_stream_monitor_when_enabled() {
        assert!(default_optimizer().create_stream_monitor().is_some());
    }

    #[test]
    fn no_stream_monitor_when_disabled() {
        let mut cfg = default_config();
        cfg.repetition_detection_enabled = false;
        assert!(TokenOptimizer::new(cfg).create_stream_monitor().is_none());
    }

    #[test]
    fn optimize_empty_tools_returns_empty() {
        let selected = default_optimizer().optimize_tools("query", &[]);
        assert!(selected.is_empty());
    }

    #[test]
    fn cap_output_tokens_no_cap() {
        assert_eq!(default_optimizer().cap_output_tokens(2048), 2048);
    }

    #[test]
    fn cap_output_tokens_within_cap() {
        let mut cfg = default_config();
        cfg.output_max_tokens = Some(1024);
        assert_eq!(TokenOptimizer::new(cfg).cap_output_tokens(512), 512);
    }

    #[test]
    fn cap_output_tokens_exceeds_cap() {
        let mut cfg = default_config();
        cfg.output_max_tokens = Some(1024);
        assert_eq!(TokenOptimizer::new(cfg).cap_output_tokens(2048), 1024);
    }

    #[tokio::test]
    async fn optimize_conversation_returns_no_max_tokens_by_default() {
        let optimizer = default_optimizer();
        let mut conv = Conversation::new();
        conv.add_user_message("What is the weather today?");
        conv.add_assistant_message("It's sunny.");
        conv.add_user_message("Thanks!");

        let report = optimizer
            .optimize_conversation(&mut conv, None)
            .await
            .expect("optimization should succeed");
        assert!(report.recommended_max_tokens.is_none());
    }

    #[tokio::test]
    async fn optimize_conversation_caps_output_tokens() {
        let mut cfg = default_config();
        cfg.output_max_tokens = Some(64);
        let optimizer = TokenOptimizer::new(cfg);
        let mut conv = Conversation::new();
        conv.add_user_message("Explain quantum mechanics in detail please.");

        let report = optimizer
            .optimize_conversation(&mut conv, None)
            .await
            .expect("optimization should succeed");
        assert_eq!(report.recommended_max_tokens, Some(64));
    }

    #[test]
    fn optimization_plan_empty_for_clean_conversation() {
        let conv = Conversation::new();
        let allocation = BudgetAllocation {
            system_prompt: 512,
            rag_context: 1024,
            tool_definitions: 512,
            history: 4096,
            response_headroom: 1024,
            requires_compaction: false,
            pressure: 0.1,
        };

        let plan = OptimizationPlan::build(&conv, &allocation, 100);
        assert!(
            plan.steps.is_empty(),
            "empty conversation should produce no optimization steps"
        );
        assert_eq!(plan.total_estimated_savings(), 0);
    }

    #[test]
    fn optimization_plan_total_estimated_savings_sums_steps() {
        let step = |name: &'static str, estimated_savings: u32| OptimizationStep {
            name,
            estimated_savings,
        };
        let plan = OptimizationPlan {
            steps: vec![step("step_a", 200), step("step_b", 300)],
        };
        assert_eq!(plan.total_estimated_savings(), 500);
    }

    /// Builds a tool message; the constructor signature differs per feature set.
    #[cfg(not(feature = "pisovereign"))]
    fn make_tool_msg(content: impl Into<String>) -> crate::types::ChatMessage {
        crate::types::ChatMessage::tool(content)
    }

    /// Builds a tool message; the constructor signature differs per feature set.
    #[cfg(feature = "pisovereign")]
    fn make_tool_msg(content: impl Into<String>) -> crate::types::ChatMessage {
        crate::types::ChatMessage::tool("tool_call_id", content)
    }

    #[test]
    fn optimization_plan_sorted_descending_by_savings() {
        let mut conv = Conversation::with_system_prompt("A".repeat(4096));
        for _ in 0..5 {
            conv.messages.push(make_tool_msg("T".repeat(2048)));
        }
        conv.add_user_message("query");
        let allocation = BudgetAllocation {
            system_prompt: 10,
            rag_context: 1024,
            tool_definitions: 512,
            history: 4096,
            response_headroom: 1024,
            requires_compaction: true,
            pressure: 0.95,
        };

        let plan = OptimizationPlan::build(&conv, &allocation, 10);
        let savings: Vec<u32> = plan
            .steps
            .iter()
            .map(|step| step.estimated_savings)
            .collect();
        // Equivalent to comparing against a descending-sorted copy.
        assert!(
            savings.windows(2).all(|pair| pair[0] >= pair[1]),
            "plan steps must be sorted descending by savings"
        );
    }

    #[tokio::test]
    async fn optimize_conversation_result_includes_plan() {
        let optimizer = default_optimizer();
        let mut conv = Conversation::new();
        conv.add_user_message("What is 2+2?");
        conv.add_assistant_message("4");
        conv.add_user_message("Thanks");

        let report = optimizer
            .optimize_conversation(&mut conv, None)
            .await
            .expect("optimization should succeed");
        // Only checks that the plan is present and can be queried.
        let _ = report.plan.total_estimated_savings();
    }
}