use anyhow::Result;
use std::fmt::Write;
use std::path::Path;
use std::time::Duration;
use crate::logging;
use crate::models::{
CacheControl, ContentBlock, Message, MessageRequest, SystemBlock, SystemPrompt,
context_window_for_model,
};
use zagens_core::compaction::CompactionConfig;
use super::plan::plan_compaction;
use super::prune::{prune_tool_results, tail_chars, truncate_chars};
use super::tokens::{estimate_tokens, should_compact};
/// Escapes the XML-significant characters `&`, `<` and `>` in `s`.
///
/// The result is meant to be embedded between XML-like tags (for example the
/// `<compaction_summary>` wrapper built in `compact_messages`) without the
/// embedded text being able to open or close a tag itself.
///
/// The input is processed in a single pass, so an `&` that belongs to an
/// entity produced by this function is never escaped a second time.
fn xml_escape_inline(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
use super::{
CACHE_ALIGNED_SUMMARY_CONTEXT_BUDGET_PERCENT, KEEP_RECENT_MESSAGES,
LARGE_CONTEXT_SUMMARY_INPUT_HEAD_CHARS, LARGE_CONTEXT_SUMMARY_INPUT_MAX_CHARS,
LARGE_CONTEXT_SUMMARY_INPUT_TAIL_CHARS, LARGE_CONTEXT_SUMMARY_MAX_TOKENS,
LARGE_CONTEXT_SUMMARY_TEXT_SNIPPET_CHARS, LARGE_CONTEXT_SUMMARY_TOOL_RESULT_SNIPPET_CHARS,
LARGE_CONTEXT_WINDOW_TOKENS, SUMMARY_INPUT_HEAD_CHARS, SUMMARY_INPUT_MAX_CHARS,
SUMMARY_INPUT_TAIL_CHARS, SUMMARY_TEXT_SNIPPET_CHARS, SUMMARY_TOOL_RESULT_SNIPPET_CHARS,
};
/// Size limits applied when building a summary request.
///
/// Produced by `summary_input_limits_for_model` and consumed by the two
/// request builders in this file.
#[derive(Debug, Clone, Copy)]
pub(crate) struct SummaryInputLimits {
/// Character cap passed to `truncate_chars` for each text block in the
/// formatted transcript.
pub(crate) text_snippet_chars: usize,
/// Character cap passed to `truncate_chars` for each tool-result block in
/// the formatted transcript.
pub(crate) tool_result_snippet_chars: usize,
/// Transcript length (in characters) above which the formatted transcript
/// is reduced to a head and a tail excerpt.
pub(crate) input_max_chars: usize,
/// Number of characters requested for the head excerpt of an over-long transcript.
pub(crate) input_head_chars: usize,
/// Number of characters requested for the tail excerpt of an over-long transcript.
pub(crate) input_tail_chars: usize,
/// Value used for `max_tokens` on the summary request.
pub(crate) max_tokens: u32,
/// Word limit interpolated into the summary instruction text.
pub(crate) word_limit: usize,
}
/// Selects the summary size limits for `model`.
///
/// Models whose known context window is at least
/// `LARGE_CONTEXT_WINDOW_TOKENS` get the `LARGE_CONTEXT_*` limits and a
/// 900-word summary. Every other model, including one whose context window is
/// unknown, gets the default limits, 1024 output tokens and a 500-word summary.
pub(crate) fn summary_input_limits_for_model(model: &str) -> SummaryInputLimits {
    match context_window_for_model(model) {
        Some(window) if window >= LARGE_CONTEXT_WINDOW_TOKENS => SummaryInputLimits {
            text_snippet_chars: LARGE_CONTEXT_SUMMARY_TEXT_SNIPPET_CHARS,
            tool_result_snippet_chars: LARGE_CONTEXT_SUMMARY_TOOL_RESULT_SNIPPET_CHARS,
            input_max_chars: LARGE_CONTEXT_SUMMARY_INPUT_MAX_CHARS,
            input_head_chars: LARGE_CONTEXT_SUMMARY_INPUT_HEAD_CHARS,
            input_tail_chars: LARGE_CONTEXT_SUMMARY_INPUT_TAIL_CHARS,
            max_tokens: LARGE_CONTEXT_SUMMARY_MAX_TOKENS,
            word_limit: 900,
        },
        // Small window or unknown model: fall back to the default limits.
        _ => SummaryInputLimits {
            text_snippet_chars: SUMMARY_TEXT_SNIPPET_CHARS,
            tool_result_snippet_chars: SUMMARY_TOOL_RESULT_SNIPPET_CHARS,
            input_max_chars: SUMMARY_INPUT_MAX_CHARS,
            input_head_chars: SUMMARY_INPUT_HEAD_CHARS,
            input_tail_chars: SUMMARY_INPUT_TAIL_CHARS,
            max_tokens: 1_024,
            word_limit: 500,
        },
    }
}
/// Outcome of `compact_messages_safe`.
pub struct CompactionResult {
/// Messages to keep after compaction: either the locally pruned history or
/// the pinned messages returned by `compact_messages`.
pub messages: Vec<Message>,
/// System prompt carrying the generated summary; `None` when no summary
/// was produced (for example when local pruning alone was sufficient).
pub summary_prompt: Option<SystemPrompt>,
/// Messages that were replaced by the summary; empty when nothing was summarized.
#[allow(dead_code)]
pub removed_messages: Vec<Message>,
/// Zero-based index of the attempt that succeeded (0 means the first attempt).
pub retries_used: u32,
/// Record describing the replaced message range, when a summary was produced.
pub artifact: Option<zagens_core::compaction::CompactionArtifact>,
}
pub(crate) fn is_transient_error(e: &anyhow::Error) -> bool {
let category = crate::error_taxonomy::classify_error_message(&e.to_string());
matches!(
category,
crate::error_taxonomy::ErrorCategory::Network
| crate::error_taxonomy::ErrorCategory::RateLimit
| crate::error_taxonomy::ErrorCategory::Timeout
)
}
/// Compacts `messages` with a local prune step and retry handling.
///
/// A copy of the history first goes through `prune_tool_results`. If that
/// removed anything and the pruned history no longer needs compaction
/// according to `should_compact` (while the original did), the pruned history
/// is returned without calling the model.
///
/// Otherwise `compact_messages` is called up to three times. Before the second
/// and third attempts the task sleeps for 1 s and 2 s respectively.
///
/// # Errors
///
/// Returns immediately with the underlying error when it is not transient
/// (see `is_transient_error`); returns the last transient error once all
/// attempts are used up.
pub async fn compact_messages_safe(
    client: &dyn crate::llm_client::LlmClient,
    messages: &[Message],
    config: &CompactionConfig,
    workspace: Option<&Path>,
    external_pins: Option<&[usize]>,
    external_working_set_paths: Option<&[String]>,
) -> Result<CompactionResult> {
    const MAX_RETRIES: u32 = 3;
    const BASE_DELAY_MS: u64 = 1000;

    let mut pruned_messages = messages.to_vec();
    let pruned_bytes = prune_tool_results(&mut pruned_messages, KEEP_RECENT_MESSAGES);

    let compaction_input: &[Message] = if pruned_bytes > 0 {
        logging::info(format!(
            "Local tool-result prune saved {pruned_bytes} bytes before LLM compaction"
        ));
        // Both checks are always evaluated, original history first.
        let over_before_prune = should_compact(
            messages,
            config,
            workspace,
            external_pins,
            external_working_set_paths,
        );
        let over_after_prune = should_compact(
            &pruned_messages,
            config,
            workspace,
            external_pins,
            external_working_set_paths,
        );
        if over_before_prune && !over_after_prune {
            // Pruning alone brought the history back under the threshold.
            return Ok(CompactionResult {
                messages: pruned_messages,
                summary_prompt: None,
                removed_messages: Vec::new(),
                retries_used: 0,
                artifact: None,
            });
        }
        &pruned_messages
    } else {
        messages
    };

    let mut last_error: Option<anyhow::Error> = None;
    for attempt in 0..MAX_RETRIES {
        if attempt > 0 {
            // Exponential backoff: the delay doubles with every further attempt.
            let backoff_ms = BASE_DELAY_MS * (1 << (attempt - 1));
            tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
        }

        let outcome = compact_messages(
            client,
            compaction_input,
            config,
            workspace,
            external_pins,
            external_working_set_paths,
        )
        .await;

        let failure = match outcome {
            Ok((kept, summary_prompt, removed, artifact)) => {
                return Ok(CompactionResult {
                    messages: kept,
                    summary_prompt,
                    removed_messages: removed,
                    retries_used: attempt,
                    artifact,
                });
            }
            Err(failure) => failure,
        };

        if !is_transient_error(&failure) {
            return Err(failure);
        }
        last_error = Some(failure);
    }

    match last_error {
        Some(failure) => Err(failure),
        None => Err(anyhow::anyhow!(
            "Compaction failed after {MAX_RETRIES} retries"
        )),
    }
}
/// Loads the user anchors stored in the workspace's `anchors.md` meta file.
///
/// The file content is split on lines consisting of `---`; each piece is
/// trimmed and empty pieces are dropped. Returns an empty vector when no
/// workspace is given or the file cannot be read.
pub(crate) fn read_workspace_anchors(workspace: Option<&Path>) -> Vec<String> {
    let content = match workspace {
        Some(ws) => {
            let anchors_path = zagens_config::workspace_meta_file_read(ws, "anchors.md");
            match std::fs::read_to_string(anchors_path) {
                Ok(text) => text,
                // A missing or unreadable file simply means "no anchors".
                Err(_) => return Vec::new(),
            }
        }
        None => return Vec::new(),
    };

    let mut anchors = Vec::new();
    for piece in content.split("\n---\n") {
        let trimmed = piece.trim();
        if !trimmed.is_empty() {
            anchors.push(trimmed.to_owned());
        }
    }
    anchors
}
pub(crate) fn anchor_summary_section(workspace: Option<&Path>) -> String {
let anchors = read_workspace_anchors(workspace);
if anchors.is_empty() {
return String::new();
}
let mut section = String::from(
"## Pinned Facts (User Anchors)\n\n\
The following facts were explicitly anchored by the user with `/anchor`. \
Preserve them across compaction cycles.\n\n",
);
for anchor in anchors {
let _ = writeln!(section, "- {anchor}");
}
section.push_str("\n---\n\n");
section
}
/// Replaces the messages selected by `plan_compaction` with a generated summary.
///
/// Returns a tuple of:
/// 1. the pinned messages that are kept verbatim,
/// 2. a system prompt holding the anchors, the summary and the workflow context,
/// 3. the messages that were summarized,
/// 4. an artifact describing the replaced range.
///
/// When `messages` is empty, all four parts are empty/`None`. When the plan
/// selects nothing to summarize, `messages` is returned unchanged with no
/// prompt and no artifact.
///
/// # Errors
///
/// Propagates any error from `create_summary`.
///
/// # Panics
///
/// Indexing `messages[idx]` panics if the plan contains an index outside `messages`.
pub async fn compact_messages(
client: &dyn crate::llm_client::LlmClient,
messages: &[Message],
config: &CompactionConfig,
workspace: Option<&Path>,
external_pins: Option<&[usize]>,
external_working_set_paths: Option<&[String]>,
) -> Result<(
Vec<Message>,
Option<SystemPrompt>,
Vec<Message>,
Option<zagens_core::compaction::CompactionArtifact>,
)> {
if messages.is_empty() {
return Ok((Vec::new(), None, Vec::new(), None));
}
let plan = plan_compaction(
messages,
workspace,
KEEP_RECENT_MESSAGES,
external_pins,
external_working_set_paths,
);
// Nothing selected for summarization: hand the history back untouched.
if plan.summarize_indices.is_empty() {
return Ok((messages.to_vec(), None, Vec::new(), None));
}
let to_summarize: Vec<Message> = plan
.summarize_indices
.iter()
.map(|&idx| messages[idx].clone())
.collect();
let summary = create_summary(client, &to_summarize, &config.model).await?;
let workflow_context = extract_workflow_context(&to_summarize, workspace);
let anchors_section = anchor_summary_section(workspace);
// Model output and extracted context are passed through `xml_escape_inline`
// before being placed inside the XML-like wrappers below.
let summary_safe = xml_escape_inline(&summary);
let workflow_safe = xml_escape_inline(&workflow_context);
let summary_block = SystemBlock {
block_type: "text".to_string(),
text: format!(
"{anchors_section}\
## 📋 Conversation Summary (Auto-Generated)\n\n\
<compaction_summary>\n{summary_safe}\n</compaction_summary>\n\n\
---\n\n\
## 🔍 Workflow Context\n\n\
<workflow_context>\n{workflow_safe}\n</workflow_context>\n\n\
---\n\n\
## 💡 What to Do Next\n\n\
You have just resumed from a context compaction. The conversation above was summarized to save space. \
Review the summary and workflow context, then continue helping the user with their task. \
If you need more details about the summarized portion, ask the user to clarify.\n\n\
---\n\n\
Pinned messages follow:"
),
// The summary block is marked "ephemeral" for caching only when the config asks for it.
cache_control: if config.cache_summary {
Some(CacheControl {
cache_type: "ephemeral".to_string(),
})
} else {
None
},
};
// Pinned messages are kept in their original order.
let pinned_messages: Vec<Message> = messages
.iter()
.enumerate()
.filter_map(|(idx, msg)| plan.pinned_indices.contains(&idx).then_some(msg.clone()))
.collect();
// NOTE(review): this condition is always true here because the empty case
// returned early above, so the `else` branch is unreachable.
let artifact = if !plan.summarize_indices.is_empty() {
use zagens_core::engine::token_estimate::TokenEstimator;
// First summarized index and one past the last one.
// NOTE(review): assumes `summarize_indices` is in ascending order - confirm in `plan_compaction`.
let replaced_start = plan.summarize_indices[0];
let replaced_end = plan.summarize_indices[plan.summarize_indices.len() - 1] + 1;
// Serialization failure degrades to an empty JSON array instead of failing the compaction.
let replaced_messages_json =
serde_json::to_string(&to_summarize).unwrap_or_else(|_| "[]".to_string());
let est = TokenEstimator;
let original_tokens: u32 = to_summarize
.iter()
.map(|m| est.estimate_message(m, false) as u32)
.sum();
// Token count is estimated on the raw (unescaped) summary text.
let summary_tokens = est.estimate_text(&summary) as u32;
Some(zagens_core::compaction::CompactionArtifact {
id: uuid::Uuid::new_v4().to_string(),
// `session_id` is left empty here; presumably filled in by the caller - TODO confirm.
// `created_at_ms` is milliseconds since the Unix epoch, or 0 if the clock is before the epoch.
session_id: String::new(), created_at_ms: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_millis() as i64)
.unwrap_or(0),
replaced_start,
replaced_end,
replaced_messages_json,
summary: summary.clone(),
original_tokens,
summary_tokens,
})
} else {
None
};
Ok((
pinned_messages,
Some(SystemPrompt::Blocks(vec![summary_block])),
to_summarize,
artifact,
))
}
/// Asks the model for a summary of `messages` and returns its text.
///
/// The request is built by `build_cache_aligned_summary_request` when
/// `should_use_cache_aligned_summary` allows it, otherwise by
/// `build_formatted_summary_request`. After the call, usage is reported to
/// `crate::cost_status` and cache telemetry is logged. The text blocks of the
/// response are joined with newlines; other block kinds are ignored.
///
/// # Errors
///
/// Propagates any error returned by `client.create_message`.
pub(crate) async fn create_summary(
    client: &dyn crate::llm_client::LlmClient,
    messages: &[Message],
    model: &str,
) -> Result<String> {
    let limits = summary_input_limits_for_model(model);
    let used_cache_aligned = should_use_cache_aligned_summary(model, messages);
    let request = match used_cache_aligned {
        true => build_cache_aligned_summary_request(model, messages, limits),
        false => build_formatted_summary_request(model, messages, limits),
    };

    let response = client.create_message(request).await?;
    crate::cost_status::report(&response.model, &response.usage);
    log_summary_cache_telemetry(used_cache_aligned, &response.usage);

    let mut text_parts: Vec<&str> = Vec::new();
    for block in &response.content {
        if let ContentBlock::Text { text, .. } = block {
            text_parts.push(text.as_str());
        }
    }
    Ok(text_parts.join("\n"))
}
/// Returns `cache_hit` as a percentage of `input_tokens`.
///
/// Yields `0.0` when `input_tokens` is zero, so the caller never divides by zero.
pub(crate) fn summary_cache_hit_percent(cache_hit: u32, input_tokens: u32) -> f64 {
    if input_tokens == 0 {
        return 0.0;
    }
    let hits = f64::from(cache_hit);
    let total = f64::from(input_tokens);
    (hits * 100.0) / total
}
/// Emits a debug-level trace event (target `compaction`) describing prompt
/// cache usage of a summary call.
///
/// `used_cache_aligned` selects the reported path label (`cache_aligned` or
/// `fallback`). Missing cache hit/miss counters in `usage` are reported as 0.
pub(crate) fn log_summary_cache_telemetry(used_cache_aligned: bool, usage: &crate::models::Usage) {
    let hit_tokens = usage.prompt_cache_hit_tokens.unwrap_or_default();
    let miss_tokens = usage.prompt_cache_miss_tokens.unwrap_or_default();
    let hit_pct = summary_cache_hit_percent(hit_tokens, usage.input_tokens);
    let path_label = if used_cache_aligned {
        "cache_aligned"
    } else {
        "fallback"
    };
    tracing::debug!(
        target: "compaction",
        "compaction summary call: path={} prompt_tokens={} cache_hit_tokens={} cache_miss_tokens={} cache_hit_pct={:.1}",
        path_label,
        usage.input_tokens,
        hit_tokens,
        miss_tokens,
        hit_pct,
    );
}
/// Decides whether the summary request may reuse the original message list
/// (the "cache aligned" path) instead of a condensed transcript.
///
/// Returns `true` only when the model's context window is known, is at least
/// `LARGE_CONTEXT_WINDOW_TOKENS`, and the estimated size of `messages` plus a
/// fixed allowance for the summary instruction fits within
/// `CACHE_ALIGNED_SUMMARY_CONTEXT_BUDGET_PERCENT` percent of that window.
pub(crate) fn should_use_cache_aligned_summary(model: &str, messages: &[Message]) -> bool {
    /// Token allowance reserved for the appended summary instruction.
    const SUMMARY_PROMPT_TOKENS: usize = 512;

    let Some(window) = context_window_for_model(model) else {
        return false;
    };
    if window < LARGE_CONTEXT_WINDOW_TOKENS {
        return false;
    }
    // Saturate instead of multiplying directly: if the window does not fit in
    // `usize` the fallback is `usize::MAX`, and a plain `*` would overflow
    // (panic in debug builds, wrap to a meaningless budget in release builds).
    let budget = usize::try_from(window)
        .unwrap_or(usize::MAX)
        .saturating_mul(CACHE_ALIGNED_SUMMARY_CONTEXT_BUDGET_PERCENT)
        / 100;
    estimate_tokens(messages).saturating_add(SUMMARY_PROMPT_TOKENS) <= budget
}
/// Builds the instruction that asks the model to summarize the conversation,
/// ending with a "Keep it under `word_limit` words." sentence.
pub(crate) fn summary_instruction(word_limit: usize) -> String {
    const GUIDANCE: &str = "Summarize the conversation above in a concise but comprehensive way. \
        Preserve key information, decisions made, exact file paths, commands, \
        errors, and tool-result facts needed to continue the work. \
        Tool outputs may be abbreviated only when they are repetitive. ";

    let mut instruction = String::from(GUIDANCE);
    instruction.push_str("Keep it under ");
    instruction.push_str(&word_limit.to_string());
    instruction.push_str(" words.");
    instruction
}
/// Builds a summary request that sends `messages` unchanged and appends the
/// summary instruction as a final user message.
///
/// No system prompt or tools are set; the request is non-streaming with
/// temperature 0.3 and `limits.max_tokens` as the output cap.
pub(crate) fn build_cache_aligned_summary_request(
    model: &str,
    messages: &[Message],
    limits: SummaryInputLimits,
) -> MessageRequest {
    let instruction = Message {
        role: String::from("user"),
        content: vec![ContentBlock::Text {
            text: summary_instruction(limits.word_limit),
            cache_control: None,
        }],
    };
    // Original history first, instruction last.
    let request_messages: Vec<Message> = messages
        .iter()
        .cloned()
        .chain(std::iter::once(instruction))
        .collect();

    MessageRequest {
        model: model.to_string(),
        messages: request_messages,
        max_tokens: limits.max_tokens,
        system: None,
        tools: None,
        tool_choice: None,
        metadata: None,
        thinking: None,
        reasoning_effort: None,
        stream: Some(false),
        temperature: Some(0.3),
        top_p: None,
    }
}
/// Builds a summary request from a condensed plain-text transcript of `messages`.
///
/// Text blocks and tool results are truncated to the per-snippet limits, tool
/// calls are reduced to their name, and the remaining block kinds are left
/// out. If the transcript exceeds `limits.input_max_chars`, only a head and a
/// tail excerpt are kept, with a marker stating how many characters were
/// dropped. The instruction and transcript are sent as a single user message.
pub(crate) fn build_formatted_summary_request(
    model: &str,
    messages: &[Message],
    limits: SummaryInputLimits,
) -> MessageRequest {
    let mut transcript = String::new();
    for message in messages {
        // Any role other than "user" is labelled as the assistant.
        let speaker = match message.role.as_str() {
            "user" => "User",
            _ => "Assistant",
        };
        for block in message.content.iter() {
            match block {
                ContentBlock::Text { text, .. } => {
                    let excerpt = truncate_chars(text, limits.text_snippet_chars);
                    let _ = write!(transcript, "{speaker}: {excerpt}\n\n");
                }
                ContentBlock::ToolUse { name, .. } => {
                    let _ = write!(transcript, "{speaker}: [Used tool: {name}]\n\n");
                }
                ContentBlock::ToolResult { content, .. } => {
                    let excerpt = truncate_chars(content, limits.tool_result_snippet_chars);
                    let _ = write!(transcript, "Tool result: {excerpt}\n\n");
                }
                // These block kinds contribute nothing to the transcript.
                ContentBlock::Thinking { .. }
                | ContentBlock::ServerToolUse { .. }
                | ContentBlock::ToolSearchToolResult { .. }
                | ContentBlock::CodeExecutionToolResult { .. } => {}
            }
        }
    }

    let total_chars = transcript.chars().count();
    if total_chars > limits.input_max_chars {
        let head = truncate_chars(&transcript, limits.input_head_chars).to_string();
        let tail = tail_chars(&transcript, limits.input_tail_chars);
        let kept_chars_head = head.chars().count();
        let kept_chars_tail = tail.chars().count();
        let omitted = total_chars
            .saturating_sub(kept_chars_head)
            .saturating_sub(kept_chars_tail);
        transcript =
            format!("{head}\n\n[... {omitted} characters omitted before summary ...]\n\n{tail}");
    }

    let instruction = summary_instruction(limits.word_limit);
    let prompt = Message {
        role: String::from("user"),
        content: vec![ContentBlock::Text {
            text: format!("{instruction}\n\n---\n\n{transcript}"),
            cache_control: None,
        }],
    };

    MessageRequest {
        model: model.to_string(),
        messages: vec![prompt],
        max_tokens: limits.max_tokens,
        system: Some(SystemPrompt::Text(String::from(
            "You are a helpful assistant that creates concise conversation summaries.",
        ))),
        tools: None,
        tool_choice: None,
        metadata: None,
        thinking: None,
        reasoning_effort: None,
        stream: Some(false),
        temperature: Some(0.3),
        top_p: None,
    }
}
pub(crate) fn extract_workflow_context(messages: &[Message], workspace: Option<&Path>) -> String {
let mut files_touched: Vec<String> = Vec::new();
let mut tools_used: Vec<String> = Vec::new();
let mut tasks_identified: Vec<String> = Vec::new();
for msg in messages {
for block in &msg.content {
match block {
ContentBlock::ToolUse { name, input, .. } => {
tools_used.push(name.clone());
if let Some(path) = extract_path_from_input(input)
&& !files_touched.contains(&path)
{
files_touched.push(path);
}
}
ContentBlock::Text { text, .. }
if (text.contains("TODO") || text.contains("task") || text.contains("need to")) => {
let task = truncate_chars(text, 200).to_string();
if !tasks_identified.contains(&task) {
tasks_identified.push(task);
}
}
_ => {}
}
}
}
let mut context = String::new();
if !files_touched.is_empty() {
context.push_str("**Files Modified/Read:**\n");
for file in &files_touched {
if let Some(ws) = workspace {
let relative = Path::new(file)
.strip_prefix(ws)
.unwrap_or(Path::new(file))
.display();
context.push_str(&format!("- `{}`\n", relative));
} else {
context.push_str(&format!("- `{}`\n", file));
}
}
context.push('\n');
}
if !tools_used.is_empty() {
context.push_str("**Tools Used:** ");
context.push_str(&tools_used.join(", "));
context.push_str("\n\n");
}
if !tasks_identified.is_empty() {
context.push_str("**Tasks/TODOs Identified:**\n");
for task in &tasks_identified {
context.push_str(&format!("- {}\n", task));
}
context.push('\n');
}
if context.is_empty() {
context.push_str("No specific workflow context detected. Continue assisting the user with their current task.\n");
}
context
}
pub(crate) fn extract_path_from_input(input: &serde_json::Value) -> Option<String> {
for key in ["path", "file", "file_path", "filename"] {
if let Some(path) = input.get(key).and_then(|v| v.as_str()) {
return Some(path.to_string());
}
}
if let Some(obj) = input.as_object() {
for (_, value) in obj {
if let Some(path) = value.as_str()
&& (path.contains('/') || path.contains('\\') || path.contains('.'))
{
return Some(path.to_string());
}
}
}
None
}