use crate::config::Config;
use crate::session::chat::get_animation_manager;
use crate::session::chat::session::ChatSession;
use crate::session::estimate_tokens;
use crate::{log_debug, log_info};
use anyhow::Result;
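/// Decides whether conversation compression should run now, returning
/// `(should_compress, target_ratio)`. With `adaptive_threshold` disabled it
/// falls back to the flat `max_session_tokens_threshold`; otherwise it walks
/// the configured pressure levels (assumed sorted ascending, highest match
/// wins), enforces the API-call cooldown, and runs a cache-aware net-benefit
/// estimate plus a check that compression would actually bring the context
/// back below the threshold before committing.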
pub async fn should_check_compression(session: &mut ChatSession, config: &Config) -> (bool, f64) {
let current_tokens = session.get_full_context_tokens(config).await;
if !config.compression.adaptive_threshold {
if config.max_session_tokens_threshold > 0
&& current_tokens >= config.max_session_tokens_threshold
{
let ratio = config
.compression
.pressure_levels
.iter()
.map(|l| l.target_ratio)
.fold(2.0_f64, f64::max);
log_debug!(
"Max session token threshold exceeded ({} >= {}) - triggering compression with ratio {:.1}x",
current_tokens,
config.max_session_tokens_threshold,
ratio
);
return (true, ratio);
}
log_debug!("Adaptive compression disabled (adaptive_threshold=false)");
return (false, 2.0);
}
if config.compression.pressure_levels.is_empty() {
log_debug!("No pressure levels configured - compression disabled");
return (false, 2.0);
}
log_debug!(
"Compression check: current_tokens={}, thresholds={:?}",
current_tokens,
config
.compression
.pressure_levels
.iter()
.map(|l| l.threshold)
.collect::<Vec<_>>()
);
let matching_level = config
.compression
.pressure_levels
.iter()
        .rev()
        .find(|level| current_tokens >= level.threshold);
match matching_level {
Some(level) => {
let adjusted_ratio = calculate_adaptive_compression_ratio(session, level.target_ratio);
log_debug!(
"✓ Threshold exceeded! Context tokens: {} → base compression: {:.1}x → adaptive: {:.1}x (threshold: {})",
current_tokens,
level.target_ratio,
adjusted_ratio,
level.threshold
);
let current_api_calls = session.session.info.total_api_calls;
let next_compression_allowed = session
.session
.info
.next_conversation_compression_at_api_call;
if current_api_calls < next_compression_allowed {
log_debug!(
"Compression cooldown active: current_api_calls={} < next_allowed={} (must wait {} more calls)",
current_api_calls,
next_compression_allowed,
next_compression_allowed - current_api_calls
);
return (false, 2.0);
}
log_debug!(
"Compression cooldown passed: current_api_calls={} >= next_allowed={}",
current_api_calls,
next_compression_allowed
);
let net_benefit = calculate_compression_net_benefit(
session,
config,
current_tokens,
                adjusted_ratio,
            )
.await;
if net_benefit > 0.0 {
let (start_idx, end_idx) = match find_compression_range(
&session.session.messages,
session.first_prompt_idx,
) {
Ok(range) => range,
Err(e) => {
log_debug!("Failed to find compression range: {}", e);
return (false, 2.0);
}
};
if start_idx >= end_idx {
log_debug!(
"Invalid compression range ({} >= {}), skipping",
start_idx,
end_idx
);
return (false, 2.0);
}
let compressible_tokens = match calculate_range_tokens(session, start_idx, end_idx)
{
Ok(tokens) => tokens,
Err(e) => {
log_debug!("Failed to calculate range tokens: {}", e);
return (false, 2.0);
}
};
let estimated_compressed_size =
(compressible_tokens as f64 / adjusted_ratio) as u64;
let estimated_after_compression = (current_tokens as u64)
.saturating_sub(compressible_tokens)
.saturating_add(estimated_compressed_size);
if estimated_after_compression >= level.threshold as u64 {
log_debug!(
"Compression won't bring context below threshold: {} → {} (threshold: {}). Compressible: {} → {}. Skipping compression.",
current_tokens,
estimated_after_compression,
level.threshold,
compressible_tokens,
estimated_compressed_size
);
return (false, 2.0);
}
log_debug!(
"Cache-aware analysis: Net benefit ${:.5} → COMPRESS (will reduce {} → {} tokens, below threshold {})",
net_benefit,
current_tokens,
estimated_after_compression,
level.threshold
);
(true, adjusted_ratio)
} else {
log_debug!(
"Cache-aware analysis: Net benefit ${:.5} → SKIP (would lose money)",
net_benefit
);
(false, 2.0)
}
}
None => {
log_debug!(
"No threshold exceeded (current: {}, lowest threshold: {})",
current_tokens,
config
.compression
.pressure_levels
.first()
.map(|l| l.threshold)
.unwrap_or(0)
);
(false, 2.0)
}
}
}
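/// Estimates the dollar benefit of compressing now versus continuing with the
/// full context, using real per-model pricing for both the session model and
/// the decision model. Positive means compression is expected to save money
/// over the estimated remaining turns; negative means skip (including when
/// pricing is unavailable). Zero-priced session models short-circuit to a
/// positive benefit so compression still runs for context management.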
async fn calculate_compression_net_benefit(
session: &ChatSession,
config: &crate::config::Config,
current_tokens: usize,
compression_ratio: f64,
) -> f64 {
let total_tokens = current_tokens as f64;
let estimated_future_turns = estimate_future_turns(session, current_tokens, compression_ratio);
let compressed_tokens = total_tokens / compression_ratio;
let decision_model = &config.compression.decision.model;
let session_model = &session.model;
let decision_pricing = get_model_pricing(decision_model, config);
let session_pricing = get_model_pricing(session_model, config);
let (decision_pricing, session_pricing) = match (decision_pricing, session_pricing) {
(Some(d), Some(s)) => (d, s),
_ => {
log_debug!(
"Cannot get pricing for models: decision='{}', session='{}' - skipping compression",
decision_model,
session_model
);
            return -1.0;
        }
};
let session_is_free = session_pricing.input_price_per_1m == 0.0
&& session_pricing.output_price_per_1m == 0.0
&& session_pricing.cache_write_price_per_1m == 0.0
&& session_pricing.cache_read_price_per_1m == 0.0;
if session_is_free {
log_debug!(
"Session model '{}' has zero pricing - compressing for context management (threshold exceeded)",
session_model
);
        return 1.0;
    }
let total_api_calls = session.session.info.total_api_calls.max(1) as f64;
let avg_new_tokens_per_call =
(session.session.info.output_tokens as f64 / total_api_calls).max(2000.0);
let decision_prompt_tokens = estimate_tokens(
"Analyze the conversation history. Should older exchanges be compressed into a summary to save context space while preserving important information? Consider:\n\
- Are there repetitive or resolved topics that can be summarized?\n\
- Is there important context that must be preserved?\n\
- Would compression help focus on current topics?\n\n\
If YES, also provide a 2-3 sentence summary preserving logical structure (focus on what's needed to continue the conversation):\n\n\
[context chunks placeholder - ~500 tokens average]\n\n\
Respond with:\n\
'YES' followed by the summary on the next line, OR\n\
'NO' if compression is not beneficial."
) as f64;
let same_model = decision_model == session_model;
let decision_max_tokens = config.compression.decision.max_tokens;
let estimated_output_tokens = if decision_max_tokens > 0 {
(compressed_tokens as u64).min(decision_max_tokens as u64)
} else {
compressed_tokens as u64
};
let mut total_cost_no_compress = 0.0;
let mut base_context = total_tokens;
for _ in 0..estimated_future_turns as i32 {
let context_cost = session_pricing.calculate_cost(
            avg_new_tokens_per_call as u64,
            0,
            base_context as u64,
            0,
        );
total_cost_no_compress += context_cost;
base_context += avg_new_tokens_per_call;
}
let ignore_cost = config.compression.decision.ignore_cost;
let compression_cost = if same_model {
decision_pricing.calculate_cost(
            decision_prompt_tokens as u64,
            0,
            (total_tokens - decision_prompt_tokens) as u64,
            estimated_output_tokens,
        )
} else {
decision_pricing.calculate_cost(
            total_tokens as u64,
            0,
            0,
            estimated_output_tokens,
        )
};
let mut total_cost_with_compress = if ignore_cost { 0.0 } else { compression_cost };
let mut base_context_compressed = compressed_tokens;
for call_num in 0..estimated_future_turns as i32 {
let (input_tokens, cache_write, cache_read) = if call_num == 0 {
(
avg_new_tokens_per_call as u64,
base_context_compressed as u64,
0,
)
} else {
(
avg_new_tokens_per_call as u64,
0,
base_context_compressed as u64,
)
};
let context_cost = session_pricing.calculate_cost(input_tokens, cache_write, cache_read, 0);
total_cost_with_compress += context_cost;
base_context_compressed += avg_new_tokens_per_call;
}
let net_benefit = total_cost_no_compress - total_cost_with_compress;
log_debug!(
"Compression analysis (REAL PRICING):\n \
Decision model: {} (input: ${:.2}/1M, output: ${:.2}/1M, cache_write: ${:.2}/1M, cache_read: ${:.2}/1M)\n \
Session model: {} (input: ${:.2}/1M, output: ${:.2}/1M, cache_write: ${:.2}/1M, cache_read: ${:.2}/1M)\n \
Models match: {} (cache reuse: {})\n \
Current: {:.0} tokens (decision prompt: ~{:.0} tokens)\n \
After compression: {:.0} tokens ({:.1}x ratio) - saves {:.0} tokens\n \
Avg new tokens/call: {:.0} (output_tokens={}, api_calls={})\n \
Future calls: {:.0}\n \
SCENARIO A (no compress): ${:.5}\n \
- Per call: cache_read(base) + input({:.0} new tokens)\n \
- Base grows: {:.0} → {:.0} tokens over {} calls\n \
SCENARIO B (compress): ${:.5}\n \
- Compression cost: ${:.5} (using {}, {} uncached, {} cached) {}\n \
- Per call: cache_read/write(base) + input({:.0} new tokens)\n \
- Base grows: {:.0} → {:.0} tokens over {} calls\n \
Net benefit: ${:.5} → {}",
decision_model,
decision_pricing.input_price_per_1m,
decision_pricing.output_price_per_1m,
decision_pricing.cache_write_price_per_1m,
decision_pricing.cache_read_price_per_1m,
session_model,
session_pricing.input_price_per_1m,
session_pricing.output_price_per_1m,
session_pricing.cache_write_price_per_1m,
session_pricing.cache_read_price_per_1m,
if same_model { "YES" } else { "NO" },
if same_model { "YES" } else { "NO" },
total_tokens,
decision_prompt_tokens,
compressed_tokens,
compression_ratio,
total_tokens - compressed_tokens,
avg_new_tokens_per_call,
session.session.info.output_tokens,
session.session.info.total_api_calls,
estimated_future_turns,
total_cost_no_compress,
avg_new_tokens_per_call,
total_tokens,
base_context,
estimated_future_turns as i32,
total_cost_with_compress,
compression_cost,
decision_model,
if same_model { decision_prompt_tokens as u64 } else { total_tokens as u64 },
if same_model { (total_tokens - decision_prompt_tokens) as u64 } else { 0 },
if ignore_cost { "[IGNORED]" } else { "" },
avg_new_tokens_per_call,
compressed_tokens,
base_context_compressed,
estimated_future_turns as i32,
net_benefit,
if net_benefit > 0.0 {
"COMPRESS ✓"
} else {
"SKIP"
}
);
net_benefit
}
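/// Resolves pricing for a `provider:model` identifier via the provider
/// factory. Returns `None` for malformed identifiers or when the provider
/// cannot be created or has no pricing for the model.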
fn get_model_pricing(
model: &str,
_config: &crate::config::Config,
) -> Option<crate::providers::ModelPricing> {
let parts: Vec<&str> = model.split(':').collect();
if parts.len() != 2 {
log_debug!(
"Invalid model format: '{}' (expected 'provider:model')",
model
);
return None;
}
let provider_name = parts[0];
let model_name = parts[1];
let provider = crate::providers::ProviderFactory::create_provider(provider_name).ok()?;
provider.get_model_pricing(model_name)
}
fn calculate_adaptive_compression_ratio(session: &ChatSession, base_ratio: f64) -> f64 {
let info = &session.session.info;
let current_api_calls = info.total_api_calls as f64;
if current_api_calls < 5.0 {
return base_ratio;
}
let tool_density = info.tool_calls as f64 / current_api_calls;
let has_plan = crate::mcp::dev::plan::core::has_active_plan();
let adjustment = if has_plan {
1.2
} else if tool_density > 2.5 {
1.15
} else if tool_density > 1.0 {
1.0
} else if tool_density > 0.3 {
0.9
} else {
0.8
};
let adaptive_ratio = base_ratio * adjustment;
let final_ratio = adaptive_ratio.clamp(1.5, 4.0);
crate::log_debug!(
"Adaptive compression ratio: base={:.1}, adjustment={:.2}, tool_density={:.2}, has_plan={}, final={:.1}",
base_ratio,
adjustment,
tool_density,
has_plan,
final_ratio
);
final_ratio
}
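/// Projects how many future API calls the post-compression headroom can
/// absorb. The physical ceiling (headroom divided by the measured token
/// growth rate, using the incremental rate after a prior compression) is
/// bounded by the call count so far (the "symmetry" value in the debug log),
/// scaled by the self-tuning accuracy factor, and floored at 5.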
fn estimate_future_turns(
session: &ChatSession,
current_tokens: usize,
compression_ratio: f64,
) -> f64 {
let info = &session.session.info;
let api_calls = info.total_api_calls as f64;
let growth_rate = if info.compression_stats.conversation_compressions > 0 {
let calls_since = (info.total_api_calls - info.api_calls_at_last_compression).max(1) as f64;
let output_since = info
.output_tokens
.saturating_sub(info.output_tokens_at_last_compression) as f64;
(output_since / calls_since).max(1.0)
} else {
(info.output_tokens as f64 / api_calls.max(1.0)).max(1.0)
};
let compressed_tokens = current_tokens as f64 / compression_ratio;
let headroom = (current_tokens as f64 - compressed_tokens).max(0.0);
let physical_ceiling = headroom / growth_rate;
let estimate = if api_calls > 0.0 {
physical_ceiling.min(api_calls)
} else {
physical_ceiling.min(100.0)
};
let accuracy = calculate_self_tuning_accuracy(info);
let adjusted = (estimate * accuracy).max(5.0);
crate::log_debug!(
"Future calls estimation: api_calls={:.0}, growth_rate={:.0} tok/call ({}), \
headroom={:.0}, physical_ceiling={:.1}, symmetry={:.1}, accuracy={:.2}, final={:.0}",
api_calls,
growth_rate,
if info.compression_stats.conversation_compressions > 0 {
"incremental"
} else {
"lifetime"
},
headroom,
physical_ceiling,
if api_calls > 0.0 {
api_calls
} else {
physical_ceiling.min(100.0)
},
accuracy,
adjusted
);
adjusted
}
fn calculate_self_tuning_accuracy(info: &crate::session::SessionInfo) -> f64 {
if info.compression_stats.conversation_compressions == 0 {
        return 1.0;
    }
let predicted = info.predicted_turns_at_last_compression;
let actual = (info.total_api_calls as f64 - info.api_calls_at_last_compression as f64).max(0.0);
if predicted <= 0.0 || actual <= 0.0 {
return 1.0;
}
let ratio = actual / predicted;
crate::log_debug!(
"Self-tuning: predicted={:.1}, actual={:.1}, correction={:.2}",
predicted,
actual,
ratio
);
ratio.clamp(0.25, 4.0)
}
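/// Entry point for the compression pipeline: checks thresholds and cooldown,
/// locates the compressible range, chunks it semantically, asks the decision
/// model for a YES/NO verdict plus summary in a single call, and applies the
/// compression. Returns `Ok(true)` only when compression actually ran.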
pub async fn check_and_compress_conversation(
session: &mut ChatSession,
config: &Config,
operation_rx: tokio::sync::watch::Receiver<bool>,
) -> Result<bool> {
let (should_check, target_ratio) = should_check_compression(session, config).await;
if !should_check {
return Ok(false);
}
if *operation_rx.borrow() {
return Err(anyhow::anyhow!("Operation cancelled"));
}
let animation_manager = get_animation_manager();
let current_cost = session.session.info.total_cost;
let max_threshold = config.max_session_tokens_threshold;
let current_context_tokens = session.get_full_context_tokens(config).await as u64;
animation_manager
.start_with_params(current_cost, current_context_tokens, max_threshold)
.await;
log_debug!("Compression check triggered - asking AI for decision and summary in one call");
let (start_idx, end_idx) =
find_compression_range(&session.session.messages, session.first_prompt_idx)?;
if start_idx >= end_idx {
log_debug!("No messages to compress (range invalid)");
animation_manager.stop_current().await;
return Ok(false);
}
let tokens_before = calculate_range_tokens(session, start_idx, end_idx)?;
let messages_to_compress = &session.session.messages[start_idx..=end_idx];
let chunks = super::semantic_chunking::chunk_messages(messages_to_compress);
let target_tokens = (tokens_before as f64 / target_ratio) as usize;
let selected = super::semantic_chunking::select_chunks_within_budget(&chunks, target_tokens);
let (critical_text, reference_text, context_chunks) = group_chunks_by_type(&selected);
let preserved_text = if !critical_text.is_empty() && !reference_text.is_empty() {
format!("{}\n{}", critical_text, reference_text)
} else if !critical_text.is_empty() {
critical_text
} else {
reference_text
};
let (should_compress, context_summary) =
ask_ai_decision_and_summary(session, config, &context_chunks, operation_rx).await?;
if !should_compress {
log_debug!("AI decided compression not beneficial at this point");
animation_manager.stop_current().await;
return Ok(false);
}
log_info!("AI decided to compress older conversation exchanges");
apply_compression(
session,
start_idx,
end_idx,
&preserved_text,
&context_summary,
tokens_before,
target_ratio,
)
.await?;
animation_manager.stop_current().await;
Ok(true)
}
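/// Sends the decision prompt (with the selected context chunks and a
/// structured summary template) to the configured decision model and parses
/// the response: a first line containing "YES" means compress, with the
/// remaining lines taken as the summary. The decision call's cost is added
/// to the session total unless `ignore_cost` is set.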
async fn ask_ai_decision_and_summary(
session: &mut ChatSession,
config: &Config,
context_chunks: &[&super::semantic_chunking::SemanticChunk],
operation_rx: tokio::sync::watch::Receiver<bool>,
) -> Result<(bool, String)> {
let mut decision_prompt = String::from(
"Analyze the conversation history. Should older exchanges be compressed into a summary to save context space while preserving important information?\n\n\
Consider:\n\
- Are there repetitive or resolved topics that can be summarized?\n\
- Is there important context that must be preserved?\n\
- Would compression help focus on current topics?\n\n"
);
if !context_chunks.is_empty() {
decision_prompt.push_str(
"If YES, provide a structured summary that PRESERVES ALL CRITICAL CONTEXT for continuing:\n\
\n\
**USER INTENT** (1-2 sentences):\n\
What did the user ask for? What is the goal or objective?\n\
\n\
**PROGRESS** (2-3 sentences):\n\
What was completed? What is currently in progress? Include counts if applicable (e.g., 'Step 2 of 5 done').\n\
\n\
**CURRENT WORK** (2-3 sentences):\n\
What is being worked on RIGHT NOW? What was just being investigated or discussed?\n\
\n\
**KEY ENTITIES** (preserve exactly):\n\
- Resources: files, documents, URLs, or references being used\n\
- Names: specific terms, identifiers, or labels involved\n\
- Issues: any problems encountered and their status\n\
- Decisions: choices made with reasoning\n\
\n\
**NEXT STEPS** (1-2 sentences):\n\
What needs to happen next to continue?\n\
\n\
**Context chunks to analyze:**\n\n",
);
for chunk in context_chunks {
let relation_hint = match chunk.discourse_relation {
super::semantic_chunking::DiscourseRelation::Cause => "[REASONING]",
super::semantic_chunking::DiscourseRelation::Contrast => "[ALTERNATIVE]",
super::semantic_chunking::DiscourseRelation::Sequence => "[STEP]",
super::semantic_chunking::DiscourseRelation::Background => "[CONTEXT]",
super::semantic_chunking::DiscourseRelation::Elaboration => "[DETAIL]",
super::semantic_chunking::DiscourseRelation::None => "",
};
if relation_hint.is_empty() {
decision_prompt.push_str(&format!("- {}\n", chunk.content.trim()));
} else {
decision_prompt.push_str(&format!("{} {}\n", relation_hint, chunk.content.trim()));
}
}
decision_prompt.push_str(
"\n\n**Response format:**\n\
YES\n\
**USER INTENT**: [What the user asked for - 1-2 sentences]\n\
**PROGRESS**: [What was completed, what's in progress - include counts if applicable]\n\
**CURRENT WORK**: [What is being worked on RIGHT NOW]\n\
**KEY ENTITIES**:\n\
- Resources: [files, documents, URLs, or references being used]\n\
- Names: [specific terms, identifiers, or labels involved]\n\
- Issues: [any problems encountered and their status]\n\
- Decisions: [choices made with reasoning]\n\
**NEXT STEPS**: [What needs to happen next]\n\
\n\
**OPTIONAL: If specific file contexts are needed to continue, include them:**\n\
<context>\n\
filename:startline:endline\n\
filename:startline:endline\n\
</context>\n\
\n\
**Format requirements for file contexts:**\n\
- Use <context> tags around file references\n\
- Each line: filepath:number:number (no spaces)\n\
- Use paths from project root (src/main.rs not ./src/main.rs)\n\
- Line numbers must be positive, start ≤ end ≤ 10000\n\
- Maximum 5 file ranges\n\
- Only include files CRITICAL for continuing\n\
\n\
OR respond with 'NO' if compression is not beneficial.",
);
} else {
decision_prompt.push_str("Respond with ONLY 'YES' to compress or 'NO' to keep as-is.");
}
let mut messages = session.session.messages.clone();
messages.push(crate::session::Message {
role: "user".to_string(),
content: decision_prompt,
timestamp: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs(),
cached: false,
tool_call_id: None,
name: None,
tool_calls: None,
images: None,
videos: None,
thinking: None,
id: None,
});
let decision_config = &config.compression.decision;
crate::log_debug!(
"Using compression decision model '{}' (max_tokens={}, temp={}, ignore_cost={})",
decision_config.model,
decision_config.max_tokens,
decision_config.temperature,
decision_config.ignore_cost
);
let params = crate::session::ChatCompletionWithValidationParams::new(
&messages,
&decision_config.model,
decision_config.temperature,
decision_config.top_p,
decision_config.top_k,
decision_config.max_tokens,
config,
)
.with_max_retries(decision_config.max_retries)
.with_chat_session(session)
.with_cancellation_token(operation_rx);
let response = crate::session::chat_completion_with_validation(params).await?;
let usage = response.exchange.usage;
let ignore_cost = decision_config.ignore_cost;
if !ignore_cost {
if let Some(ref u) = usage {
if let Some(cost) = u.cost {
session.session.info.total_cost += cost;
session.estimated_cost = session.session.info.total_cost;
log_debug!(
"Compression decision cost: ${:.5} (total: ${:.5})",
cost,
session.session.info.total_cost
);
}
}
} else {
log_debug!("Compression decision cost ignored (ignore_cost=true)");
}
let content = response.content.trim();
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() {
log_debug!("AI compression decision: NO (empty response)");
return Ok((false, String::new()));
}
let first_line = lines[0].trim().to_uppercase();
let decision = first_line.contains("YES");
if decision {
let summary = if lines.len() > 1 {
lines[1..].join("\n").trim().to_string()
} else {
String::new()
};
        log_debug!(
            "AI compression decision: YES with summary ({} bytes)",
            summary.len()
        );
Ok((true, summary))
} else {
log_debug!("AI compression decision: NO");
Ok((false, String::new()))
}
}
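/// Replaces the selected message range with a single compressed knowledge
/// entry (including any AI-requested file contexts), records compression
/// metrics, sets the cooldown window from the future-turns estimate, stores
/// the self-tuning prediction baseline, and resets cache checkpoint counters.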
async fn apply_compression(
session: &mut ChatSession,
start_idx: usize,
end_idx: usize,
preserved_text: &str,
context_summary: &str,
tokens_before: u64,
compression_ratio: f64,
) -> Result<()> {
let file_contexts = super::file_context::parse_file_contexts(context_summary);
let file_context_content = if !file_contexts.is_empty() {
crate::log_debug!(
"Compression: AI requested {} file context(s) for continuation",
file_contexts.len()
);
for (filepath, start, end) in &file_contexts {
crate::log_debug!(" - {} (lines {}-{})", filepath, start, end);
}
super::file_context::generate_file_context_content(&file_contexts)
} else {
String::new()
};
let compression_id = crate::mcp::dev::plan::compression::get_compression_id()
.unwrap_or_else(|| "unknown".to_string());
let compressed_entry = format_compressed_entry_with_context(
preserved_text,
context_summary,
&file_context_content,
compression_id,
);
let tokens_after = estimate_tokens(&compressed_entry) as u64;
let (messages_removed, _) = session.remove_messages_in_range(start_idx, end_idx)?;
session.insert_compressed_knowledge(start_idx, compressed_entry)?;
let tokens_saved = tokens_before.saturating_sub(tokens_after);
let metrics = crate::mcp::dev::plan::compression::CompressionMetrics::new(
messages_removed,
tokens_saved,
tokens_before,
);
crate::session::chat::cost_tracker::CostTracker::display_compression_result(
"Conversation",
&metrics,
);
session
.session
.info
.compression_stats
.add_conversation_compression(messages_removed, tokens_saved);
let estimated_future_turns =
estimate_future_turns(session, tokens_before as usize, compression_ratio);
let next_compression_at =
session.session.info.total_api_calls + estimated_future_turns as usize;
session
.session
.info
.next_conversation_compression_at_api_call = next_compression_at;
let api_calls_at_compression = session.session.info.total_api_calls;
session.session.info.predicted_turns_at_last_compression = estimated_future_turns;
session.session.info.api_calls_at_last_compression = api_calls_at_compression;
session.session.info.output_tokens_at_last_compression = session.session.info.output_tokens;
log_debug!(
"Self-tuning: Recorded prediction at compression #{} (API calls={}): predicted={:.1} remaining turns",
session.session.info.compression_stats.conversation_compressions,
api_calls_at_compression,
estimated_future_turns
);
log_debug!(
"Compression cooldown set: next_compression_at={} (current={}, estimated_turns={:.1})",
next_compression_at,
session.session.info.total_api_calls,
estimated_future_turns
);
let _ = crate::session::logger::log_compression_point(
&session.session.info.name,
"conversation",
messages_removed,
tokens_saved,
);
session.session.info.current_non_cached_tokens = 0;
session.session.info.current_total_tokens = 0;
session.session.info.last_cache_checkpoint_time = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
Ok(())
}
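/// Joins non-empty chunk contents with `\n- `; the caller prepends the
/// leading `- ` for the first bullet.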
fn format_chunks_verbatim(chunks: &[&super::semantic_chunking::SemanticChunk]) -> String {
chunks
.iter()
.map(|c| c.content.trim())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join("\n- ")
}
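/// Splits selected chunks by type: Critical and Reference chunks are
/// rendered verbatim as bullet text, while Context chunks are returned as-is
/// for the decision model to summarize.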
fn group_chunks_by_type(
selected: &[super::semantic_chunking::SemanticChunk],
) -> (
String,
String,
Vec<&super::semantic_chunking::SemanticChunk>,
) {
let critical: Vec<_> = selected
.iter()
.filter(|c| matches!(c.chunk_type, super::semantic_chunking::ChunkType::Critical))
.collect();
let reference: Vec<_> = selected
.iter()
.filter(|c| matches!(c.chunk_type, super::semantic_chunking::ChunkType::Reference))
.collect();
let context: Vec<_> = selected
.iter()
.filter(|c| matches!(c.chunk_type, super::semantic_chunking::ChunkType::Context))
.collect();
let critical_text = format_chunks_verbatim(&critical);
let reference_text = format_chunks_verbatim(&reference);
(critical_text, reference_text, context)
}
fn format_compressed_entry_with_context(
preserved: &str,
context: &str,
file_context: &str,
compression_id: String,
) -> String {
let mut sections = Vec::new();
if !preserved.is_empty() {
sections.push(format!(
"**CRITICAL** (preserved verbatim):\n- {}",
preserved
));
}
if !context.is_empty() {
sections.push(format!("**CONTEXT**: {}", context));
}
if !file_context.is_empty() {
sections.push(format!(
"**FILE CONTEXT** (auto-expanded):\n{}",
file_context
));
}
format!(
"## Conversation Summary [COMPRESSED: {}]\n\n{}\n\n\
**Compression Info**:\n\
- ID: `{}`\n\
- Type: Semantic compression with file context\n\
---\n\
*Compressed using importance-based semantic chunking with automatic file context expansion.*",
compression_id,
sections.join("\n\n"),
compression_id
)
}
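/// Computes the inclusive `[start_idx, end_idx]` range to compress, always
/// preserving the system prompt and the last 4 user/assistant messages.
/// Because `end_idx` is derived from the first preserved conversation index,
/// interleaved tool messages are swept into the range rather than orphaned.
/// Returns `(0, 0)` when there is nothing safe to compress.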
fn find_compression_range(
messages: &[crate::session::Message],
first_prompt_idx: Option<usize>,
) -> Result<(usize, usize)> {
let system_idx = messages
.iter()
.position(|m| m.role == "system")
.unwrap_or(0);
let conversation_indices: Vec<usize> = messages
.iter()
.enumerate()
.filter(|(_, m)| m.role == "user" || m.role == "assistant")
.map(|(idx, _)| idx)
.collect();
if conversation_indices.len() <= 4 {
        return Ok((0, 0));
    }
let preserve_count = 4;
let compress_count = conversation_indices.len() - preserve_count;
let start_idx = first_prompt_idx.unwrap_or(system_idx + 1);
let end_idx = conversation_indices[compress_count] - 1;
if start_idx >= end_idx {
return Ok((0, 0));
}
Ok((start_idx, end_idx))
}
fn calculate_range_tokens(session: &ChatSession, start_idx: usize, end_idx: usize) -> Result<u64> {
let mut total_tokens = 0u64;
if start_idx >= session.session.messages.len() {
return Err(anyhow::anyhow!("Invalid start_index in range"));
}
if end_idx >= session.session.messages.len() {
return Err(anyhow::anyhow!("Invalid end_index in range"));
}
for i in start_idx..=end_idx {
if let Some(message) = session.session.messages.get(i) {
let tokens = crate::session::estimate_message_tokens(message) as u64;
total_tokens += tokens;
}
}
Ok(total_tokens)
}
#[cfg(test)]
mod tests {
use super::find_compression_range;
use crate::session::Message;
use serde_json::json;
fn msg(role: &str) -> Message {
Message {
role: role.to_string(),
content: String::new(),
..Default::default()
}
}
#[test]
#[allow(clippy::vec_init_then_push)]
fn extends_range_to_include_tool_results() {
let mut messages = Vec::new();
messages.push(msg("system"));
messages.push(msg("user")); let mut assistant1 = msg("assistant"); assistant1.tool_calls = Some(json!([
{"id": "call_1", "type": "function", "function": {"name": "tool1"}}
]));
messages.push(assistant1);
let mut tool1 = msg("tool"); tool1.tool_call_id = Some("call_1".to_string());
messages.push(tool1);
messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant"));
let (start_idx, end_idx) = find_compression_range(&messages, None).unwrap();
assert_eq!(start_idx, 1);
assert_eq!(
end_idx, 5,
"Must include all messages before first preserved conversation message"
);
}
#[test]
#[allow(clippy::vec_init_then_push)]
fn extends_when_ending_on_assistant_with_tools() {
let mut messages = Vec::new();
messages.push(msg("system"));
messages.push(msg("user")); messages.push(msg("assistant"));
messages.push(msg("user")); let mut assistant_with_tools = msg("assistant"); assistant_with_tools.tool_calls = Some(json!([
{"id": "call_1", "type": "function", "function": {"name": "tool1"}}
]));
messages.push(assistant_with_tools);
let mut tool1 = msg("tool"); tool1.tool_call_id = Some("call_1".to_string());
messages.push(tool1);
messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant"));
let (start_idx, end_idx) = find_compression_range(&messages, None).unwrap();
assert_eq!(start_idx, 1);
assert_eq!(
end_idx, 5,
"Must include all messages (including tool results) before first preserved conversation message"
);
}
#[test]
#[allow(clippy::vec_init_then_push)]
fn handles_multiple_assistants_with_tools() {
let mut messages = Vec::new();
messages.push(msg("system"));
messages.push(msg("user"));
let mut assistant1 = msg("assistant"); assistant1.tool_calls = Some(json!([
{"id": "call_1", "type": "function", "function": {"name": "tool1"}}
]));
messages.push(assistant1);
let mut tool1 = msg("tool"); tool1.tool_call_id = Some("call_1".to_string());
messages.push(tool1);
let mut assistant2 = msg("assistant"); assistant2.tool_calls = Some(json!([
{"id": "call_2", "type": "function", "function": {"name": "tool2"}}
]));
messages.push(assistant2);
let mut tool2 = msg("tool"); tool2.tool_call_id = Some("call_2".to_string());
messages.push(tool2);
messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user"));
let (start_idx, end_idx) = find_compression_range(&messages, None).unwrap();
assert_eq!(start_idx, 1);
assert_eq!(
end_idx, 6,
"Must include all messages including tool results before first preserved"
);
}
#[test]
#[allow(clippy::vec_init_then_push)]
fn start_boundary_must_not_orphan_initial_tool_sequence() {
let mut messages = Vec::new();
messages.push(msg("system"));
let mut assistant_with_tools = msg("assistant"); assistant_with_tools.tool_calls = Some(json!([
{"id": "call_1", "type": "function", "function": {"name": "tool1"}}
]));
messages.push(assistant_with_tools);
let mut tool1 = msg("tool"); tool1.tool_call_id = Some("call_1".to_string());
messages.push(tool1);
messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant")); let (start_idx, end_idx) = find_compression_range(&messages, Some(3)).unwrap();
assert_eq!(
start_idx, 3,
"start_idx must equal first_prompt_idx (INCLUSIVE boundary)"
);
assert!(
end_idx >= 4,
"range should start compressing only after first_prompt_idx"
);
}
#[test]
#[allow(clippy::vec_init_then_push)]
#[allow(clippy::needless_range_loop)]
fn calculate_range_tokens_must_match_removal_range() {
use crate::session::estimate_message_tokens;
let mut messages = Vec::new();
messages.push(msg("system"));
let mut msg1 = msg("user"); msg1.content = "x".repeat(100); messages.push(msg1);
let mut msg2 = msg("assistant"); msg2.content = "y".repeat(200); messages.push(msg2);
let mut msg3 = msg("user"); msg3.content = "z".repeat(300); messages.push(msg3);
let mut msg4 = msg("assistant"); msg4.content = "a".repeat(400); messages.push(msg4);
messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant"));
let (start_idx, end_idx) = find_compression_range(&messages, None).unwrap();
assert!(start_idx < end_idx, "Range must be valid");
let mut expected_tokens = 0u64;
for i in (start_idx + 1)..=end_idx {
expected_tokens += estimate_message_tokens(&messages[i]) as u64;
}
let mut chunked_tokens = 0u64;
for i in start_idx..=end_idx {
chunked_tokens += estimate_message_tokens(&messages[i]) as u64;
}
if expected_tokens != chunked_tokens {
let start_msg_tokens = estimate_message_tokens(&messages[start_idx]) as u64;
assert_eq!(
chunked_tokens - expected_tokens,
start_msg_tokens,
"The difference should be exactly the tokens in start_idx message"
);
}
}
#[test]
#[allow(clippy::needless_range_loop)]
fn bug_proof_token_mismatch_causes_zero_savings() {
use crate::session::estimate_message_tokens;
let mut messages = Vec::new();
messages.push(msg("system"));
let mut large_msg = msg("user"); large_msg.content = "x".repeat(4000); messages.push(large_msg);
let mut small1 = msg("assistant"); small1.content = "y".repeat(40); messages.push(small1);
let mut small2 = msg("user"); small2.content = "z".repeat(40); messages.push(small2);
let mut small3 = msg("assistant"); small3.content = "a".repeat(40); messages.push(small3);
messages.push(msg("user")); messages.push(msg("assistant")); messages.push(msg("user")); messages.push(msg("assistant"));
let (start_idx, end_idx) = find_compression_range(&messages, None).unwrap();
        assert_eq!(start_idx, 1);
        assert_eq!(end_idx, 4);
let mut tokens_counted_by_function = 0u64;
for i in (start_idx + 1)..=end_idx {
tokens_counted_by_function += estimate_message_tokens(&messages[i]) as u64;
}
let mut tokens_in_chunking = 0u64;
for i in start_idx..=end_idx {
tokens_in_chunking += estimate_message_tokens(&messages[i]) as u64;
}
let large_msg_tokens = estimate_message_tokens(&messages[start_idx]) as u64;
println!("Large message tokens: {}", large_msg_tokens);
println!("Tokens counted by function: {}", tokens_counted_by_function);
println!("Tokens in chunking: {}", tokens_in_chunking);
assert_eq!(
tokens_in_chunking,
tokens_counted_by_function + large_msg_tokens,
"Chunking includes the large message that wasn't counted!"
);
assert!(
large_msg_tokens > tokens_counted_by_function,
"Large message ({}) should have more tokens than all small messages combined ({})",
large_msg_tokens,
tokens_counted_by_function
);
}
#[test]
fn bug_proof_insufficient_compression_triggers_loop() {
let full_context_tokens = 55_000u64;
let threshold = 50_000u64;
        let non_compressible_tokens = 52_000u64;
        let compressible_tokens = 3_000u64;
let compression_ratio = 2.0;
assert_eq!(
full_context_tokens,
non_compressible_tokens + compressible_tokens
);
let compressed_tokens = (compressible_tokens as f64 / compression_ratio) as u64;
let tokens_after_compression = non_compressible_tokens + compressed_tokens;
assert!(
tokens_after_compression > threshold,
"Compression didn't bring context below threshold: {} > {}",
tokens_after_compression,
threshold
);
}
#[test]
fn bug_proof_compression_should_verify_benefit() {
let threshold = 50_000u64;
let system_tokens = 5_000u64;
let tools_tokens = 30_000u64;
let recent_4_messages_tokens = 20_000u64;
let old_compressible_tokens = 2_000u64;
let non_compressible = system_tokens + tools_tokens + recent_4_messages_tokens;
let full_context = non_compressible + old_compressible_tokens;
assert!(full_context > threshold, "Triggers compression");
let best_case_compressed = old_compressible_tokens / 10;
let best_case_result = non_compressible + best_case_compressed;
assert!(
best_case_result > threshold,
"Non-compressible portion alone exceeds threshold: {} > {}",
best_case_result,
threshold
);
}
#[test]
fn test_cooldown_prevents_premature_recompression() {
let current_api_calls = 10;
let estimated_turns = 20.0;
let next_compression_at = current_api_calls + estimated_turns as usize;
assert_eq!(
next_compression_at, 30,
"Next compression should be at call 30"
);
let current_at_15 = 15;
assert!(
current_at_15 < next_compression_at,
"Cooldown should block at call 15: {} < {}",
current_at_15,
next_compression_at
);
let current_at_29 = 29;
assert!(
current_at_29 < next_compression_at,
"Cooldown should still block at call 29: {} < {}",
current_at_29,
next_compression_at
);
let current_at_30 = 30;
assert!(
current_at_30 >= next_compression_at,
"Cooldown should pass at call 30: {} >= {}",
current_at_30,
next_compression_at
);
let current_at_35 = 35;
assert!(
current_at_35 >= next_compression_at,
"Compression should be allowed at call 35: {} >= {}",
current_at_35,
next_compression_at
);
}
#[test]
fn test_cooldown_default_allows_first_compression() {
        let next_compression_at = 0;
        let current_api_calls = 1;
assert!(
current_api_calls >= next_compression_at,
"First compression should be allowed: {} >= {}",
current_api_calls,
next_compression_at
);
let current_at_0 = 0;
assert!(
current_at_0 >= next_compression_at,
"Compression should be allowed even at call 0: {} >= {}",
current_at_0,
next_compression_at
);
}
#[test]
fn test_cooldown_calculation_with_varying_estimates() {
let current = 10;
let estimated_short = 5.0;
let next_short = current + estimated_short as usize;
assert_eq!(next_short, 15, "Short estimate: next at 15");
let estimated_medium = 20.0;
let next_medium = current + estimated_medium as usize;
assert_eq!(next_medium, 30, "Medium estimate: next at 30");
let estimated_long = 50.0;
let next_long = current + estimated_long as usize;
assert_eq!(next_long, 60, "Long estimate: next at 60");
assert!(next_short < next_medium, "Short cooldown < medium cooldown");
assert!(next_medium < next_long, "Medium cooldown < long cooldown");
}
#[test]
fn test_estimate_physical_ceiling_is_headroom_over_growth() {
let current_tokens = 100_000.0_f64;
let compression_ratio = 2.5_f64;
        let compressed = current_tokens / compression_ratio;
        let headroom = current_tokens - compressed;
        let growth_rate = 5_000.0_f64;
        let ceiling = headroom / growth_rate;
        assert_eq!(ceiling, 12.0);
        let ceiling_fast = headroom / 10_000.0_f64;
        assert!(ceiling_fast < ceiling, "faster growth → lower ceiling");
        let compressed_aggressive = current_tokens / 4.0;
        let headroom_aggressive = current_tokens - compressed_aggressive;
        let ceiling_aggressive = headroom_aggressive / growth_rate;
        assert!(
ceiling_aggressive > ceiling,
"more compression → more headroom → higher ceiling"
);
}
#[test]
fn test_estimate_symmetry_is_api_calls_so_far() {
let api_calls = 20.0_f64;
let physical_ceiling = 30.0_f64;
let estimate = physical_ceiling.min(api_calls);
assert_eq!(
estimate, api_calls,
"symmetry wins when smaller than ceiling"
);
let api_calls_large = 50.0_f64;
let estimate2 = physical_ceiling.min(api_calls_large);
assert_eq!(
estimate2, physical_ceiling,
"ceiling wins when smaller than symmetry"
);
}
#[test]
fn test_estimate_zero_api_calls_caps_physical_ceiling() {
let current_tokens = 100_000.0_f64;
let compression_ratio = 2.5_f64;
let compressed = current_tokens / compression_ratio;
let headroom = current_tokens - compressed;
        let growth_rate = (0.0_f64 / 1.0_f64).max(1.0);
        let raw_ceiling = headroom / growth_rate;
        assert_eq!(raw_ceiling, 60_000.0);
let estimate = raw_ceiling.min(100.0);
assert_eq!(estimate, 100.0, "cold-start ceiling capped at 100, not 60k");
assert!(estimate >= 5.0, "always at least 5");
}
#[test]
fn test_estimate_growth_rate_from_measured_output() {
let cases = [
            (10.0_f64, 50_000.0_f64, 5_000.0_f64),
            (1.0, 3_000.0, 3_000.0),
            (0.0, 0.0, 1.0),
        ];
for (api_calls, output_tokens, expected) in cases {
let rate = (output_tokens / api_calls.max(1.0)).max(1.0);
assert_eq!(
rate, expected,
"api_calls={api_calls}, output={output_tokens}"
);
}
}
#[test]
fn test_self_tuning_direct_ratio_no_blending() {
let predicted = 20.0_f64;
let actual = 10.0_f64;
let ratio = (actual / predicted).clamp(0.25, 4.0);
assert_eq!(ratio, 0.5, "underestimated → ratio < 1");
let ratio2 = (30.0_f64 / 10.0_f64).clamp(0.25, 4.0);
assert_eq!(ratio2, 3.0, "overestimated → ratio > 1");
let ratio_extreme_low = (1.0_f64 / 100.0_f64).clamp(0.25, 4.0);
assert_eq!(ratio_extreme_low, 0.25, "extreme low clamped");
let ratio_extreme_high = (100.0_f64 / 1.0_f64).clamp(0.25, 4.0);
assert_eq!(ratio_extreme_high, 4.0, "extreme high clamped");
}
#[test]
fn test_self_tuning_neutral_when_no_prior_compression() {
let compressions = 0_usize;
let result = if compressions == 0 { 1.0_f64 } else { 0.5 };
assert_eq!(result, 1.0, "no prior data → neutral multiplier");
}
#[test]
fn test_estimate_end_to_end_symmetry_wins() {
let api_calls = 10.0_f64;
let output_tokens = 50_000.0_f64;
let current_tokens = 100_000.0_f64;
let compression_ratio = 2.5_f64;
        let growth_rate = (output_tokens / api_calls).max(1.0);
        let headroom = current_tokens - current_tokens / compression_ratio;
        let ceiling = headroom / growth_rate;
        let estimate = ceiling.min(api_calls);
assert_eq!(ceiling, 12.0);
assert_eq!(estimate, 10.0, "symmetry (10) wins over ceiling (12)");
assert!(estimate >= 5.0);
}
#[test]
fn test_estimate_end_to_end_ceiling_wins() {
let api_calls = 30.0_f64;
let output_tokens = 300_000.0_f64;
let current_tokens = 100_000.0_f64;
let compression_ratio = 2.5_f64;
        let growth_rate = (output_tokens / api_calls).max(1.0);
        let headroom = current_tokens - current_tokens / compression_ratio;
        let ceiling = headroom / growth_rate;
        let estimate = ceiling.min(api_calls);
assert_eq!(ceiling, 6.0);
assert_eq!(estimate, 6.0, "ceiling (6) wins over symmetry (30)");
assert!(estimate >= 5.0);
}
#[test]
fn test_estimate_incremental_growth_rate_after_compression() {
let total_api_calls: usize = 25;
let total_output_tokens: u64 = 210_000;
let api_calls_at_last_compression: usize = 20;
let output_tokens_at_last_compression: u64 = 200_000;
        let calls_since = (total_api_calls - api_calls_at_last_compression).max(1) as f64;
        let output_since =
            total_output_tokens.saturating_sub(output_tokens_at_last_compression) as f64;
        let incremental_rate = (output_since / calls_since).max(1.0);
        assert_eq!(
incremental_rate, 2_000.0,
"incremental rate reflects post-compression phase"
);
        let lifetime_rate = (total_output_tokens as f64 / total_api_calls as f64).max(1.0);
        assert_eq!(
lifetime_rate, 8_400.0,
"lifetime rate is inflated by heavy early phase"
);
let current_tokens = 100_000.0_f64;
let compression_ratio = 2.5_f64;
let headroom = current_tokens - current_tokens / compression_ratio;
        let ceiling_incremental = headroom / incremental_rate;
        let ceiling_lifetime = headroom / lifetime_rate;
assert!(
ceiling_incremental > ceiling_lifetime,
"incremental ceiling ({ceiling_incremental}) > lifetime ceiling ({ceiling_lifetime}): \
stale lifetime rate would trigger re-compression 4x too soon"
);
assert_eq!(ceiling_incremental, 30.0);
}
#[test]
fn test_estimate_growth_rate_falls_back_to_lifetime_before_first_compression() {
let compressions: usize = 0;
let total_api_calls = 10_usize;
let total_output_tokens: u64 = 50_000;
let api_calls_at_last_compression: usize = 0;
let output_tokens_at_last_compression: u64 = 0;
let growth_rate = if compressions > 0 {
let calls_since = (total_api_calls - api_calls_at_last_compression).max(1) as f64;
let output_since =
total_output_tokens.saturating_sub(output_tokens_at_last_compression) as f64;
(output_since / calls_since).max(1.0)
} else {
(total_output_tokens as f64 / total_api_calls.max(1) as f64).max(1.0)
};
assert_eq!(
growth_rate, 5_000.0,
"lifetime fallback: 50k / 10 calls = 5k/call"
);
}
#[test]
fn test_estimate_incremental_rate_single_call_since_compression() {
let total_api_calls: usize = 21;
let total_output_tokens: u64 = 205_000;
let api_calls_at_last_compression: usize = 20;
let output_tokens_at_last_compression: u64 = 200_000;
        let calls_since = (total_api_calls - api_calls_at_last_compression).max(1) as f64;
        let output_since =
            total_output_tokens.saturating_sub(output_tokens_at_last_compression) as f64;
        let rate = (output_since / calls_since).max(1.0);
assert_eq!(
rate, 5_000.0,
"single post-compression call measured correctly"
);
}
#[test]
fn test_estimate_incremental_rate_saturating_sub_prevents_underflow() {
let total_output_tokens: u64 = 1_000;
        let output_tokens_at_last_compression: u64 = 5_000;
        let output_since = total_output_tokens.saturating_sub(output_tokens_at_last_compression);
        assert_eq!(output_since, 0, "saturating_sub: no underflow");
let rate = (output_since as f64 / 1.0_f64).max(1.0);
assert_eq!(rate, 1.0, "floors at 1.0, no panic");
}
}