atomcode_core/ctx/render.rs
//! Default render & compression-plan policy for atomcode ctx.
//!
//! [`build_messages`], [`needs_compression`], and
//! [`build_compression_content`] implement the out-of-the-box context
//! behavior. `DefaultCtx` is a thin wrapper over them; `OllamaCtx`
//! reuses `build_messages` / `build_compression_content` and overrides
//! only the compression threshold (early trigger).
//!
//! Implementations wanting different behavior (different thresholds,
//! different compression content format, different cold-zone layout)
//! write their own `impl CtxBuilder` without touching this module.
//!
//! All functions here are free functions taking `&Conversation`,
//! keeping `Conversation` as a pure data container — no render logic
//! leaks back into the data layer.
16
17use crate::conversation::message::{self, Message, MessageContent, Role};
18use crate::conversation::{ContextStats, Conversation, KEEP_MESSAGES};
19
/// Append model-specific behavioral directives to a system prompt.
///
/// Previously scattered as `if model_id.contains(...)` branches inside
/// `agent::prompt::build_system_prompt`. Moved here so per-model prompt
/// customization lives in the ctx layer alongside other per-model logic
/// (compression threshold, tool-output cap, etc).
///
/// `model_id` is matched case-insensitively: it is lowercased here, so
/// callers that already pass `provider.model.to_lowercase()` keep working
/// and callers that forget no longer silently lose the directives.
///
/// Currently handles two groups:
/// - CN language lock: minimax / qwen / deepseek / kimi models default
///   to English reasoning even when the user speaks Chinese; one gentle
///   line nudges user-visible output back to zh-CN.
/// - MiniMax thinking discipline: MiniMax M2 has no reasoning_effort
///   knob and defaults to extremely verbose `<think>` blocks; a
///   system-reminder near the tail caps it to ≤3 sentences via recency
///   bias.
///
/// Returns the original prompt followed by zero, one, or both directive
/// blocks (language lock first, then the MiniMax reminder).
///
/// Impls that don't want these (e.g. a hypothetical ClaudeCtx) simply
/// don't call this function — the hooks live in each `build_messages`
/// impl, not in `ctx::render::build_messages`.
pub(crate) fn apply_model_directives(system_prompt: &str, model_id: &str) -> String {
    /// Model families that need the "answer in Chinese" nudge.
    const CN_LOCKED_FAMILIES: [&str; 4] = ["minimax", "qwen", "deepseek", "kimi"];

    // Normalize once so every substring check below is case-insensitive.
    let model_id = model_id.to_lowercase();

    // 512 bytes of headroom covers both directive blocks without regrowth.
    let mut out = String::with_capacity(system_prompt.len() + 512);
    out.push_str(system_prompt);

    let needs_cn_lock = CN_LOCKED_FAMILIES
        .iter()
        .any(|family| model_id.contains(*family));
    if needs_cn_lock {
        out.push_str("\n用户可见的输出请用中文。工具调用和代码保持原样。\n");
    }

    // MiniMax M2's thinking is extremely verbose by default, which burns
    // output tokens and slows responses. The model exposes no
    // reasoning_effort switch, so the only lever is the prompt. Placing the
    // reminder near the tail relies on recency so it takes effect on every
    // round — effectively a lightweight system-reminder.
    if model_id.contains("minimax") {
        out.push_str(
            "\n<system-reminder>\n\
             THINKING 简洁纪律:内部思考(<think> 块)必须极简,\
             只写必要的决策线索,不要复述工具结果、不要分点展开、不要自问自答。\
             目标 ≤ 3 句话。冗长 thinking 视为严重问题。\n\
             </system-reminder>\n",
        );
    }

    out
}
69
70/// Context management with cold zone compression.
71///
72/// Structure: [System] [Cold Zone (max 3 summaries)] [Last 5 turns full]
73///
74/// The cold zone is populated by `Conversation::apply_compression` when
75/// total tokens exceed ~70% of budget. If still over 80% after cold zone
76/// injection, this function drops oldest turns inline.
77///
78/// `turn_reminder` — if non-empty, prepended to the last User message.
79/// Keeps the system prompt prefix stable across turns (好 cache),
80/// while still delivering per-turn dynamic context (git diff, current
81/// task, etc). Empty string = no injection.
82pub fn build_messages(
83 conv: &Conversation,
84 system_prompt: &str,
85 token_budget: usize,
86 turn_reminder: &str,
87) -> (Vec<Message>, ContextStats) {
88 if conv.messages.is_empty() {
89 return (
90 vec![Message::new(Role::System, system_prompt)],
91 ContextStats::default(),
92 );
93 }
94
95 let system_msg = Message::new(Role::System, system_prompt);
96 let system_tokens = system_msg.estimate_tokens();
97
98 let turns = &conv.turn_tracker.turns;
99
100 if turns.is_empty() {
101 let remaining = token_budget.saturating_sub(system_tokens);
102 return (
103 build_messages_fallback(conv, system_msg, remaining),
104 ContextStats::default(),
105 );
106 }
107
108 let mut result = Vec::with_capacity(conv.messages.len() + 3);
109 result.push(system_msg);
110
111 // Inject cold zone summaries (if any)
112 if !conv.cold_summaries.is_empty() {
113 let cold_text = format!(
114 "[Earlier conversation history ({} compression{})]\n{}",
115 conv.cold_summaries.len(),
116 if conv.cold_summaries.len() > 1 {
117 "s"
118 } else {
119 ""
120 },
121 conv.cold_summaries.join("\n---\n")
122 );
123 result.push(Message::new(Role::System, cold_text));
124 }
125
126 // Add all current messages
127 result.extend(conv.messages.iter().cloned());
128
129 // NOTE: read_file result condensation was here (83fc7ff) but reverted.
130 // 问题: 长距离重读是合理需求(旧内容被压缩后模型需要重新看),
131 // 短距离重读在 keep_recent 保护内又压缩不到。两头不讨好。
132 // 正确方案需要更深入设计,不在这里做。
133
134 // Safety: if over 80% (or 60K absolute cap), drop oldest turns.
135 // BUT: skip if cold_summaries exist — that means LLM compression just ran
136 // and we're looking at the "keep_full=5" survivor set. Dropping those too
137 // would wipe ALL context (the bug that caused sent=0 in audit sessions).
138 let budget_80pct = (token_budget * 80 / 100).min(60000);
139 let total_tokens: usize = result.iter().map(|m| m.estimate_tokens()).sum();
140 let mut dropped_tokens = 0usize;
141
142 if total_tokens > budget_80pct && conv.cold_summaries.is_empty() {
143 let tokens_to_drop = total_tokens - budget_80pct;
144
145 // ── HARD FLOOR: the last turn is sacred and NEVER dropped ──
146 // Without this floor, a single oversized tool_result could make `tokens_to_drop`
147 // exceed the sum of all earlier turns, and the `survived_start` calculation below
148 // would settle on `conv.messages.len()` → NO messages survive → sent=0 → agent
149 // goes blind and repeats searches forever (2026-04-12 21:25 session pathology).
150 let last_turn_idx = turns.len().saturating_sub(1);
151 let last_turn_start = turns
152 .get(last_turn_idx)
153 .map(|t| t.start_idx)
154 .unwrap_or(0)
155 .min(conv.messages.len());
156
157 // First pass: identify which turns to drop and extract their reasoning.
158 // Loop bound `turns.len()-1` ensures we never touch the last turn.
159 let mut drop_summaries: Vec<String> = Vec::new();
160 let mut drop_count = 0usize;
161
162 for ti in 0..turns.len().saturating_sub(1) {
163 if dropped_tokens >= tokens_to_drop {
164 break;
165 }
166 let turn = &turns[ti];
167 let end = turn.end_idx().min(conv.messages.len());
168 if turn.start_idx >= conv.messages.len() {
169 continue;
170 }
171
172 // Extract model reasoning and tool calls before dropping
173 let turn_msgs = &conv.messages[turn.start_idx..end];
174 let mut parts: Vec<String> = Vec::new();
175 for msg in turn_msgs {
176 match &msg.content {
177 MessageContent::Text(t) if msg.role == Role::Assistant => {
178 let short: String = t.chars().take(150).collect();
179 if !short.trim().is_empty() {
180 parts.push(short);
181 }
182 }
183 MessageContent::AssistantWithToolCalls {
184 text, tool_calls, ..
185 } => {
186 if let Some(t) = text {
187 let short: String = t.chars().take(150).collect();
188 if !short.trim().is_empty() {
189 parts.push(short);
190 }
191 }
192 let tools: Vec<&str> =
193 tool_calls.iter().map(|tc| tc.name.as_str()).collect();
194 if !tools.is_empty() {
195 parts.push(format!("tools: {}", tools.join(", ")));
196 }
197 }
198 _ => {}
199 }
200 }
201 if !parts.is_empty() {
202 drop_summaries.push(parts.join(" | "));
203 }
204
205 dropped_tokens += turn_msgs.iter().map(|m| m.estimate_tokens()).sum::<usize>();
206 drop_count += 1;
207 }
208
209 // Rebuild: system + cold zone + drop digest + surviving messages
210 let cold_msgs = if conv.cold_summaries.is_empty() { 1 } else { 2 };
211 result.truncate(cold_msgs);
212
213 // Inject mechanical digest of dropped turns so model retains reasoning chain
214 if !drop_summaries.is_empty() {
215 let digest = format!(
216 "[Context overflow: {} earlier turns compressed]\n{}",
217 drop_count,
218 drop_summaries
219 .iter()
220 .enumerate()
221 .map(|(i, s)| format!("{}. {}", i + 1, s))
222 .collect::<Vec<_>>()
223 .join("\n")
224 );
225 result.push(Message::new(Role::System, digest));
226 }
227
228 // Find first surviving message, clamped to last_turn_start so the last turn always survives.
229 let mut survived_start = 0;
230 let mut skipped = 0usize;
231 for ti in 0..turns.len() {
232 let turn = &turns[ti];
233 let end = turn.end_idx().min(conv.messages.len());
234 if turn.start_idx >= conv.messages.len() {
235 continue;
236 }
237 let t: usize = conv.messages[turn.start_idx..end]
238 .iter()
239 .map(|m| m.estimate_tokens())
240 .sum();
241 skipped += t;
242 if skipped >= dropped_tokens {
243 survived_start = if ti + 1 < turns.len() {
244 turns[ti + 1].start_idx
245 } else {
246 // Old code set this to conv.messages.len() → no survivors.
247 // Clamp to last_turn_start to preserve at least the last turn.
248 last_turn_start
249 };
250 break;
251 }
252 }
253 // Final clamp: survived_start must not skip past the last turn.
254 survived_start = survived_start.min(last_turn_start);
255 result.extend(conv.messages[survived_start..].iter().cloned());
256 }
257
258 // Microcompact: condense PRIOR-TURN ToolResults to one-line stubs.
259 // Current turn (everything from last User message onward) is always
260 // full-fidelity — see the microcompact() docstring for the
261 // turn-aware boundary rationale (this fixes the pre-5-8
262 // `HELLO_TEST_12345` bug where fixed-window stubbing could clip
263 // the in-flight turn).
264 //
265 // Threshold = min(budget × 70%, 100K chars). The 100K cap keeps
266 // long-session token savings (kicks in around ~25K tokens of
267 // history); the 70%-of-budget floor protects small-context models
268 // from compacting too eagerly.
269 let microcompact_threshold =
270 ((token_budget as u64 * 4 * 70 / 100) as usize).min(100_000);
271 microcompact(&mut result, conv.messages.len(), microcompact_threshold);
272
273 replace_stale_reads(&mut result);
274 // sanitize_messages drops AssistantWithToolCalls whose tool_calls
275 // didn't all get followed by matching tool_result messages before
276 // a non-tool boundary (next ATC / Text / MultiPart). Required to
277 // satisfy DeepSeek's strict `insufficient tool messages following
278 // tool_calls message` 400 and the equivalent Claude/OpenAI/Gemini
279 // pairing contracts. Several upstream paths can leave the
280 // conversation in this state (cancel mid-batch, hard-truncate
281 // landing between ATC and its results, /resume of an old session)
282 // — sanitizing at send time is the defensive backstop that catches
283 // them all uniformly. Already wired into the fallback path
284 // (`build_messages_fallback`); this call extends the same safety net
285 // to the main turn-tracked path. Runs BEFORE clean_message_pipeline
286 // so the consecutive-User merger downstream can collapse any
287 // adjacent User messages that the dropped ATC was previously
288 // separating.
289 sanitize_messages(&mut result);
290 clean_message_pipeline(&mut result);
291
292 // ── ABSOLUTE FLOOR (runs AFTER all cleanup, right before sent_tokens calc) ──
293 // If compaction + cleanup somehow left us with only system messages, graft back
294 // the last user message so the LLM has *something* to respond to. This is the
295 // strictest possible invariant: whenever conv.messages is non-empty, the result
296 // must contain at least one non-system message.
297 let non_system_count = result
298 .iter()
299 .filter(|m| !matches!(m.role, Role::System))
300 .count();
301 if non_system_count == 0 {
302 if let Some(last_user) =
303 conv.messages.iter().rev().find(|m| {
304 matches!(m.role, Role::User) && matches!(m.content, MessageContent::Text(..))
305 })
306 {
307 result.push(Message::new(
308 Role::System,
309 "[Emergency: prior conversation was dropped during compaction. Only the latest user message is preserved.]"
310 ));
311 result.push(last_user.clone());
312 }
313 }
314
315 // ── FINAL BYTE CEILING (last-line-of-defense) ──
316 // microcompact protects the last 20 messages; the 80% drop cap at
317 // line ~181 skips entirely when `cold_summaries` is populated
318 // (legacy protection against a since-fixed pathology). That
319 // leaves the recent window with no byte enforcement, so accumulated
320 // mid-sized ToolResults can still blow the budget. Single
321 // oldest-first forward pass: condense each ToolResult once only
322 // (idempotent `condensed()` would otherwise spin), stop as soon
323 // as the total fits under 80% of budget. The last 4 messages
324 // (current turn's work) and Text / AssistantWithToolCalls are
325 // never touched.
326 let token_ceiling = token_budget.saturating_mul(80) / 100;
327 let keep_tail = 4.min(result.len());
328 let shrinkable_end = result.len().saturating_sub(keep_tail);
329 // Build call_id → tool_name so `condensed` can pick the right
330 // summarization strategy per tool (read_file → skeleton, others →
331 // first-line). Without this, `condensed` would have had to guess
332 // from output shape — a substring heuristic that false-positived
333 // on bash outputs with `" N| ..."` lines.
334 let call_id_to_tool: std::collections::HashMap<String, String> = result
335 .iter()
336 .filter_map(|m| {
337 if let MessageContent::AssistantWithToolCalls { tool_calls, .. } = &m.content {
338 Some(tool_calls.iter().map(|tc| (tc.id.clone(), tc.name.clone())))
339 } else {
340 None
341 }
342 })
343 .flatten()
344 .collect();
345 for i in 1..shrinkable_end {
346 let total: usize = result.iter().map(|m| m.estimate_tokens()).sum();
347 if total <= token_ceiling {
348 break;
349 }
350 let tool_name = match &result[i].content {
351 MessageContent::ToolResult(r) => call_id_to_tool
352 .get(&r.call_id)
353 .map(|s| s.as_str())
354 .unwrap_or(""),
355 _ => continue,
356 };
357 let before = result[i].estimate_tokens();
358 let condensed = result[i].condensed(tool_name);
359 if condensed.estimate_tokens() < before {
360 result[i] = condensed;
361 }
362 }
363
364 // Turn reminder: prepend to last User message. Runs AFTER all
365 // compaction/cleanup so the reminder always rides the most recent
366 // user turn. Keeps system_prompt itself stable (cacheable).
367 if !turn_reminder.is_empty() {
368 for msg in result.iter_mut().rev() {
369 if matches!(msg.role, Role::User) {
370 if let MessageContent::Text(ref mut text) = msg.content {
371 *text = format!("{}\n{}", turn_reminder, text);
372 break;
373 }
374 }
375 }
376 }
377
378 let sent_tokens: usize = result
379 .iter()
380 .map(|m| m.estimate_tokens())
381 .sum::<usize>()
382 .saturating_sub(system_tokens);
383 let msg_count = result.len();
384 (
385 result,
386 ContextStats {
387 system_tokens,
388 sent_tokens,
389 dropped_tokens,
390 total_messages: msg_count,
391 },
392 )
393}
394
/// Headroom kept free on large windows (CC / Anthropic 200K territory),
/// where compaction can afford a generous response + tool-result runway.
/// Mirrors CC's `AUTOCOMPACT_BUFFER_TOKENS`.
pub const AUTO_COMPACT_BUFFER_LARGE: usize = 13_000;

/// Headroom kept free on small / proxy-bound windows (typical self-hosted
/// GLM 65K). 5K is room for one streaming response plus a round of tool
/// results without triggering compaction so early that the usable session
/// shrinks. A 13K buffer on a 65K cap would fire at 52K and waste the
/// range just above it, which is where users do real work.
pub const AUTO_COMPACT_BUFFER_SMALL: usize = 5_000;

/// Boundary between "small" and "large" windows. Budgets strictly greater
/// than this value use the large buffer; budgets up to and including it
/// (self-hosted / proxy-bound deployments) use the small one.
pub const AUTO_COMPACT_LARGE_WINDOW_FROM: usize = 100_000;

/// Token total above which `needs_compression` fires for a given window.
///
/// The reserved buffer depends on the window size:
/// - `token_budget` ≤ 100K: 5K buffer (65K window → 60K trigger).
/// - `token_budget` > 100K: 13K buffer (200K window → 187K trigger).
/// - In both cases the buffer is capped at `token_budget / 4`, so tiny
///   windows (8K Ollama → 6K trigger) keep a meaningful threshold instead
///   of collapsing towards 0.
pub fn auto_compact_threshold(token_budget: usize) -> usize {
    let nominal_headroom = match token_budget {
        budget if budget > AUTO_COMPACT_LARGE_WINDOW_FROM => AUTO_COMPACT_BUFFER_LARGE,
        _ => AUTO_COMPACT_BUFFER_SMALL,
    };
    // Never reserve more than a quarter of the window.
    let headroom = std::cmp::min(nominal_headroom, token_budget / 4);
    token_budget.saturating_sub(headroom)
}
432
433/// Check if context needs compression.
434///
435/// Threshold derived from `auto_compact_threshold` — fires when fewer
436/// than `buffer` tokens remain (5K for ≤100K windows, 13K for >100K).
437/// Buffer scales with the deployment: self-hosted GLM at 65K trips
438/// at 60K (4K runway is plenty for one round); Anthropic at 200K
439/// trips at 187K, matching CC's behaviour.
440///
441/// The `messages.len() < 12` guard stays — needs a non-trivial backlog
442/// before compression is worthwhile, and 1 user msg can produce 15+
443/// messages so message count is the right unit.
444pub fn needs_compression(
445 conv: &Conversation,
446 system_prompt_tokens: usize,
447 token_budget: usize,
448) -> bool {
449 if conv.messages.len() < 12 {
450 return false;
451 }
452 let total: usize = system_prompt_tokens
453 + conv
454 .messages
455 .iter()
456 .map(|m| m.estimate_tokens())
457 .sum::<usize>();
458 total > auto_compact_threshold(token_budget)
459}
460
461/// Build content for LLM compression.
462///
463/// Strategy: keep the last `KEEP_MESSAGES` messages at full fidelity,
464/// compress everything before that into one-line-per-round summaries.
465/// Returns `(compressed_text, number_of_messages_to_remove)`.
466///
467/// This operates at MESSAGE level, not turn level, because `turn_tracker`
468/// counts user messages (1 user msg = 1 turn) but a single user message
469/// can produce 15+ LLM calls with 35+ messages.
470pub fn build_compression_content(conv: &Conversation) -> (String, usize) {
471 if conv.messages.len() <= KEEP_MESSAGES {
472 return (String::new(), 0);
473 }
474
475 let mut compress_end_idx = conv.messages.len() - KEEP_MESSAGES;
476
477 // ── Pair-preserving snap ──
478 // Anthropic API requires every `tool_result` to have its paired
479 // `tool_use` in the same conversation. If the naive cut lands on a
480 // ToolResult whose ATC lives in the drop range, the surviving range
481 // begins with an orphan — `clean_message_pipeline` would silently
482 // drop it and we'd lose the edit confirmation / tool output.
483 //
484 // Advance the cut forward past any trailing ToolResults so they
485 // get dropped WITH their paired ATC (already in the drop range),
486 // not kept as orphans. `compress_msgs` below uses the same index
487 // so the summary captures these results too.
488 while compress_end_idx < conv.messages.len() {
489 match &conv.messages[compress_end_idx].content {
490 message::MessageContent::ToolResult(_) | message::MessageContent::ToolResultRef(_) => {
491 compress_end_idx += 1;
492 }
493 _ => break,
494 }
495 }
496
497 // If snapping consumed all remaining messages, nothing to compress.
498 if compress_end_idx >= conv.messages.len() {
499 return (String::new(), 0);
500 }
501
502 // Group messages into logical rounds (assistant + tool_calls + tool_results)
503 // and compress each round into a one-liner.
504 let mut content = String::new();
505 let mut round = 0usize;
506 let compress_msgs = &conv.messages[..compress_end_idx];
507 let mut i = 0;
508 while i < compress_msgs.len() {
509 // Collect messages for this round
510 let round_start = i;
511 // A round starts at a User or Assistant message and includes
512 // all subsequent tool results until the next User/Assistant.
513 i += 1;
514 while i < compress_msgs.len() {
515 match compress_msgs[i].role {
516 message::Role::User | message::Role::Assistant => break,
517 _ => i += 1,
518 }
519 }
520 round += 1;
521 let round_msgs = &compress_msgs[round_start..i];
522 content.push_str(&compress_turn(round, round_msgs));
523 content.push('\n');
524 }
525
526 // Return message count (not turn count) for apply_compression
527 (content, compress_end_idx)
528}
529
// ─── private helpers ────────────────────────────────────────────────
531
532/// Compress a turn into a one-line mechanical summary.
533/// No LLM call — deterministic, fast, never fails.
534/// Format: "Turn N: user asked X → read file.js, edited file.js (-3 +5 lines)"
535// ── INVARIANT (2026-04-16): compress_turn MUST preserve assistant thinking ──
536// The assistant's text (thinking/reasoning) in AssistantWithToolCalls is the
537// diagnostic conclusion for that turn ("代码逻辑看起来正确", "问题找到了!ID不匹配").
538// Without it, the compressed summary says only "read main.ts, grep closeSettings"
539// — the model doesn't know it already confirmed the logic was correct, so it
540// searches the same files again. 39-turn loop sessions traced to this omission.
541fn compress_turn(turn_num: usize, turn_msgs: &[Message]) -> String {
542 let mut user_text = String::new();
543 let mut assistant_text = String::new();
544 let mut tools: Vec<String> = Vec::new();
545
546 for msg in turn_msgs {
547 match (&msg.role, &msg.content) {
548 (Role::User, MessageContent::Text(s)) => {
549 if !s.starts_with('[') {
550 // skip system-injected messages
551 user_text = if s.chars().count() > 60 {
552 format!("{}...", s.chars().take(57).collect::<String>())
553 } else {
554 s.clone()
555 };
556 }
557 }
558 (
559 _,
560 MessageContent::AssistantWithToolCalls {
561 text, tool_calls, ..
562 },
563 ) => {
564 // Preserve assistant's diagnostic conclusion (first 80 chars).
565 if let Some(t) = text {
566 let trimmed = t.trim();
567 if !trimmed.is_empty() && assistant_text.is_empty() {
568 assistant_text = if trimmed.chars().count() > 80 {
569 format!("{}...", trimmed.chars().take(77).collect::<String>())
570 } else {
571 trimmed.to_string()
572 };
573 }
574 }
575 for tc in tool_calls {
576 let short = if let Ok(args) =
577 serde_json::from_str::<serde_json::Value>(&tc.arguments)
578 {
579 let fp = args.get("file_path").and_then(|v| v.as_str()).map(|p| {
580 std::path::Path::new(p)
581 .file_name()
582 .map(|n| n.to_string_lossy().to_string())
583 .unwrap_or_else(|| p.to_string())
584 });
585 match (tc.name.as_str(), fp) {
586 ("read_file", Some(f)) => format!("read {}", f),
587 ("edit_file", Some(f)) => format!("edit {}", f),
588 ("write_file", Some(f)) => format!("write {}", f),
589 ("grep", _) => {
590 let pat =
591 args.get("pattern").and_then(|v| v.as_str()).unwrap_or("?");
592 format!("grep({})", pat)
593 }
594 ("bash", _) => {
595 let cmd =
596 args.get("command").and_then(|v| v.as_str()).unwrap_or("?");
597 let short_cmd: String = cmd.chars().take(30).collect();
598 format!("bash({})", short_cmd)
599 }
600 (name, _) => name.to_string(),
601 }
602 } else {
603 tc.name.clone()
604 };
605 if !tools.contains(&short) {
606 tools.push(short);
607 }
608 }
609 }
610 (Role::Assistant, MessageContent::Text(s)) => {
611 if assistant_text.is_empty() {
612 let trimmed = s.trim();
613 if !trimmed.is_empty() {
614 assistant_text = if trimmed.chars().count() > 80 {
615 format!("{}...", trimmed.chars().take(77).collect::<String>())
616 } else {
617 trimmed.to_string()
618 };
619 }
620 }
621 }
622 (_, MessageContent::ToolResult(r)) if !r.success => {
623 tools.push("FAILED".to_string());
624 }
625 _ => {}
626 }
627 }
628
629 let tools_str = if tools.is_empty() {
630 "no tools".to_string()
631 } else {
632 tools.join(", ")
633 };
634
635 let prefix = if !user_text.is_empty() {
636 format!("\"{}\" ", user_text)
637 } else {
638 String::new()
639 };
640 let conclusion = if !assistant_text.is_empty() {
641 format!("[{}] ", assistant_text)
642 } else {
643 String::new()
644 };
645 format!(
646 "- Turn {}: {}{}→ {}",
647 turn_num, prefix, conclusion, tools_str
648 )
649}
650
651/// Fallback windowing when no turns are tracked.
652/// Keeps as many recent messages as fit within 60% of remaining budget.
653fn build_messages_fallback(
654 conv: &Conversation,
655 system_msg: Message,
656 remaining_budget: usize,
657) -> Vec<Message> {
658 let budget = remaining_budget * 60 / 100;
659 let mut used = 0usize;
660 let mut start = conv.messages.len();
661
662 for i in (0..conv.messages.len()).rev() {
663 let msg_tokens = conv.messages[i].estimate_tokens();
664 if used + msg_tokens > budget {
665 break;
666 }
667 used += msg_tokens;
668 start = i;
669 }
670 start = snap_to_valid_boundary(&conv.messages, start);
671
672 let mut result = Vec::with_capacity(conv.messages.len() - start + 1);
673 result.push(system_msg);
674 result.extend(conv.messages[start..].iter().cloned());
675 sanitize_messages(&mut result);
676 result
677}
678
679/// Snap an index to a valid message boundary for the API.
680fn snap_to_valid_boundary(messages: &[Message], idx: usize) -> usize {
681 let mut start = idx.min(messages.len());
682
683 // Skip orphan ToolResult/ToolResultRef messages
684 while start < messages.len() {
685 match &messages[start].content {
686 MessageContent::ToolResult(_) | MessageContent::ToolResultRef(_) => start += 1,
687 _ => break,
688 }
689 }
690
691 // Prefer starting at a User message
692 let original = start;
693 while start < messages.len() {
694 if matches!(messages[start].role, Role::User | Role::System) {
695 break;
696 }
697 start += 1;
698 if start > original + 5 {
699 return original;
700 }
701 }
702 start
703}
704
// ─── Message-list manipulation helpers used during render ───────────
// These operate on `&mut Vec<Message>` and are called by
// `build_messages` to apply rolling condensation / freshness
// replacement / sanity cleanup.
709
/// Size floor (in bytes of tool output) for collapsing: outputs at or below
/// this are left alone. It is also what makes compaction idempotent — the
/// stubs produced below are well under this size, so a second pass never
/// re-stubs them.
pub(crate) const MIN_COLLAPSE_SIZE: usize = 500;

/// Build the generic one-line stub that stands in for a tool output.
///
/// Shared by microcompact (render time, ephemeral) and the conv-level
/// Tier 1 compaction (destructive). The tool name is supplied by the caller
/// from the model's own tool_calls, so every tool gets the same shape and
/// the framework needs no per-tool knowledge.
///
/// Result: `[<tool_name> <ok|FAILED>: <N> lines, first: <preview>]`, where
/// `<N>` is the line count of `output` and `<preview>` is at most 80 chars.
///
/// **Preview line selection.** The bash tool prefixes its output with an
/// `[elapsed: Xs, exit: N]` metadata line. Showing that line would tell the
/// model only the exit code, so when line 1 starts with `[elapsed:` the
/// preview uses line 2 instead (the actual error / output). If there is no
/// line 2, line 1 is used after all. Empty output yields `(empty)`. Tools
/// whose output lacks the prefix are unaffected.
///
/// **Hardcoding note.** Matching `[elapsed:` is knowledge of this
/// framework's own bash output format, not of any tech stack — the prefix
/// is the same whether the command is cargo, npm, or anything else.
pub(crate) fn build_compact_stub(tool_name: &str, output: &str, success: bool) -> String {
    let mut lines = output.lines();
    let preview_source = match lines.next() {
        None => "(empty)",
        // Skip the bash metadata line when something follows it.
        Some(head) if head.starts_with("[elapsed:") => lines.next().unwrap_or(head),
        Some(head) => head,
    };
    let preview: String = preview_source.chars().take(80).collect();

    let total_lines = output.lines().count();
    let verdict = match success {
        true => "ok",
        false => "FAILED",
    };
    format!(
        "[{} {}: {} lines, first: {}]",
        tool_name, verdict, total_lines, preview,
    )
}
752
753/// Build a `call_id -> tool_name` lookup from a slice of messages. The
754/// `MessageContent::AssistantWithToolCalls` variant carries the model's
755/// own tool name; this is what we surface in stubs.
756fn build_call_id_to_tool_map(
757 msgs: &[Message],
758) -> std::collections::HashMap<String, String> {
759 let mut map = std::collections::HashMap::new();
760 for msg in msgs {
761 if let MessageContent::AssistantWithToolCalls { tool_calls, .. } = &msg.content {
762 for tc in tool_calls {
763 map.insert(tc.id.clone(), tc.name.clone());
764 }
765 }
766 }
767 map
768}
769
770/// Conv-level Tier 1 compaction. Replaces tool_result bodies in turns
771/// older than `keep_recent_turns` with the same generic stub used by
772/// microcompact. This is the destructive counterpart: microcompact runs
773/// every render and is ephemeral (only mutates the rendered Vec); this
774/// runs from the agent emergency path and permanently shrinks
775/// `conv.messages` so the next `needs_compression` check sees the
776/// freed budget.
777///
778/// Idempotent: stubs already in place are smaller than MIN_COLLAPSE_SIZE
779/// and skip the rewrite.
780pub(crate) fn compact_old_tool_results_in_place(
781 conv: &mut crate::conversation::Conversation,
782 keep_recent_turns: usize,
783) {
784 let turns = &conv.turn_tracker.turns;
785 if turns.len() <= keep_recent_turns {
786 return;
787 }
788 let cutoff_turn = turns.len() - keep_recent_turns;
789 let cutoff_msg = turns[cutoff_turn].start_idx.min(conv.messages.len());
790
791 let call_id_to_tool = build_call_id_to_tool_map(&conv.messages);
792
793 for i in 0..cutoff_msg {
794 let MessageContent::ToolResult(ref tr) = conv.messages[i].content else {
795 continue;
796 };
797 if tr.output.len() <= MIN_COLLAPSE_SIZE {
798 continue;
799 }
800 let tool_name = call_id_to_tool
801 .get(&tr.call_id)
802 .map(|s| s.as_str())
803 .unwrap_or("tool");
804 let summary = build_compact_stub(tool_name, &tr.output, tr.success);
805 conv.messages[i].content = MessageContent::ToolResult(crate::tool::ToolResult {
806 call_id: tr.call_id.clone(),
807 output: summary,
808 success: tr.success,
809 });
810 }
811}
812
813/// Microcompact: condense **prior-turn** `ToolResult` messages to one-line
814/// semantic summaries. Zero LLM calls — purely mechanical compression.
815///
816/// **Turn-aware boundary (5-8 redesign).** Earlier versions used a
817/// fixed `OTHER_KEEP = 20` last-messages window. That window slid every
818/// LLM round, so within ONE user turn the model's earlier tool results
819/// got progressively stubbed as the model emitted more tool calls —
820/// the "model echoes HELLO_TEST_12345 to verify it can see anything"
821/// 5-8 atomgr session was caused by this. Now we anchor on the last
822/// `Role::User` message in the rendered Vec: everything from that
823/// message onward IS the current turn and stays full-fidelity; only
824/// strictly older content is eligible for stubbing.
825///
826/// **Threshold (5-8 redesign).** Earlier capped at 100K chars (~25K
827/// tokens) → triggered at ~20% of a 131K-token window, way too eager.
828/// Now `threshold_chars = 70% × token_budget × 4` (uncapped) so
829/// microcompact only fires when the conversation is genuinely close
830/// to filling the model's window. Below 70% it's a no-op.
831///
832/// **Stub format.** `[<tool_name> <ok|FAILED>: N lines, first: <80c>]`.
833/// Tool name comes from the model's own `tool_calls.name` (no
834/// `match tool_name { "bash" => ... }` framework branches). `read_file`
835/// is exempted by hardcoded skip — see in-line comment for rationale.
836fn microcompact(msgs: &mut Vec<Message>, _total_msg_count: usize, threshold_chars: usize) {
837 let total_chars: usize = msgs
838 .iter()
839 .map(|m| match &m.content {
840 MessageContent::ToolResult(r) => r.output.len(),
841 MessageContent::Text(t) => t.len(),
842 _ => 100,
843 })
844 .sum();
845 if total_chars < threshold_chars {
846 return;
847 }
848
849 // Anchor on the last User message — everything after it is the
850 // ACTIVE turn and must stay full. If no User message (cold start
851 // / system-only), there's nothing to compress yet.
852 let current_turn_start = match msgs
853 .iter()
854 .rposition(|m| matches!(m.role, Role::User))
855 {
856 Some(i) => i,
857 None => return,
858 };
859
860 let cold_msgs = msgs
861 .iter()
862 .position(|m| !matches!(m.role, Role::System))
863 .unwrap_or(0);
864
865 if cold_msgs >= current_turn_start {
866 return; // nothing between system and current turn
867 }
868
869 let call_id_to_tool = build_call_id_to_tool_map(msgs);
870
871 for i in cold_msgs..current_turn_start {
872 let MessageContent::ToolResult(ref r) = msgs[i].content else {
873 continue;
874 };
875
876 if r.output.len() <= MIN_COLLAPSE_SIZE {
877 continue;
878 }
879
880 let tool_name = call_id_to_tool
881 .get(&r.call_id)
882 .map(|s| s.as_str())
883 .unwrap_or("tool");
884
885 // read_file 永远不被 microcompact 压缩。stub 给模型的
886 // `first: 205| pub async fn dynamic_connect(` 信息会制造"伪自信"
887 // ——模型以为还记得函数体就直接 edit,结果反复修同一个文件
888 // (5-7 atomgr datalog T22-T29 实证 6 turn 反复修补)。保留全文
889 // 让模型在 edit 系列 turn 里始终看到最新代码。
890 // D3 FileStore 已经处理 re-read 的 disk-side 成本;prompt-side
891 // 多花 5-10% token 换"模型不丢上下文",是值得的交易。
892 //
893 // 关于硬编码: 这里直接字符串比较 "read_file",而非工具自声明
894 // (e.g. trait fn microcompact_eligible)。妥协理由:
895 // (a) "read_file" 是框架自家工具名常量,不是 cargo/npm/pytest
896 // 这类技术栈关键字,不违反"框架对技术栈中立"的项目铁律;
897 // (b) 改成 trait 方法需要把 ToolRegistry 引用穿进 render 层,
898 // 渲染路径调用面增大,收益不抵成本;
899 // (c) 仅此一处,未来如有第二个工具也要豁免,再重构成 trait。
900 if tool_name == "read_file" {
901 continue;
902 }
903
904 let summary = build_compact_stub(tool_name, &r.output, r.success);
905
906 msgs[i].content = MessageContent::ToolResult(crate::tool::ToolResult {
907 call_id: r.call_id.clone(),
908 output: summary,
909 success: r.success,
910 });
911 }
912}
913
914/// Replace stale read_file results with current disk content.
915/// When a file was read then later edited, the old read result is outdated.
916/// This replaces it so the model always sees the latest version.
917fn replace_stale_reads(msgs: &mut Vec<Message>) {
918 struct ReadInfo {
919 file_path: String,
920 offset: Option<usize>,
921 limit: Option<usize>,
922 }
923 let mut call_id_to_read: std::collections::HashMap<String, ReadInfo> =
924 std::collections::HashMap::new();
925 let mut edit_call_to_file: std::collections::HashMap<String, String> =
926 std::collections::HashMap::new();
927 let mut edited_files: std::collections::HashSet<String> = std::collections::HashSet::new();
928
929 for msg in msgs.iter() {
930 if let MessageContent::AssistantWithToolCalls { tool_calls, .. } = &msg.content {
931 for tc in tool_calls {
932 if let Ok(args) = serde_json::from_str::<serde_json::Value>(&tc.arguments) {
933 let file_path = args
934 .get("file_path")
935 .and_then(|v| v.as_str())
936 .unwrap_or("")
937 .to_string();
938 if tc.name == "read_file" && !file_path.is_empty() {
939 let offset = args
940 .get("offset")
941 .and_then(|v| v.as_u64())
942 .map(|v| v as usize);
943 let limit = args
944 .get("limit")
945 .and_then(|v| v.as_u64())
946 .map(|v| v as usize);
947 call_id_to_read.insert(
948 tc.id.clone(),
949 ReadInfo {
950 file_path: file_path.clone(),
951 offset,
952 limit,
953 },
954 );
955 }
956 if matches!(tc.name.as_str(), "edit_file" | "write_file" | "create_file")
957 && !file_path.is_empty()
958 {
959 edit_call_to_file.insert(tc.id.clone(), file_path);
960 }
961 }
962 }
963 }
964 if let MessageContent::ToolResult(ref r) = msg.content {
965 if let Some(file_path) = edit_call_to_file.get(&r.call_id) {
966 if !r.output.starts_with("Error") {
967 edited_files.insert(file_path.clone());
968 }
969 }
970 }
971 }
972
973 if edited_files.is_empty() {
974 return;
975 }
976
977 for msg in msgs.iter_mut() {
978 if let MessageContent::ToolResult(ref mut r) = msg.content {
979 if let Some(info) = call_id_to_read.get(&r.call_id) {
980 if !edited_files.contains(&info.file_path) {
981 continue;
982 }
983 if let Ok(content) = std::fs::read_to_string(&info.file_path) {
984 let all_lines: Vec<&str> = content.lines().collect();
985 let total = all_lines.len();
986
987 if info.offset.is_some() || info.limit.is_some() {
988 let start = info.offset.unwrap_or(1).max(1) - 1;
989 let start = start.min(total);
990 let end = info.limit.map(|l| (start + l).min(total)).unwrap_or(total);
991 let display: String = all_lines[start..end]
992 .iter()
993 .enumerate()
994 .map(|(i, l)| format!("{:>4}| {}", start + i + 1, l))
995 .collect::<Vec<_>>()
996 .join("\n");
997 r.output = display;
998 } else if total <= 300 {
999 r.output = all_lines
1000 .iter()
1001 .enumerate()
1002 .map(|(i, l)| format!("{:>4}| {}", i + 1, l))
1003 .collect::<Vec<_>>()
1004 .join("\n");
1005 }
1006 // else: large-file full-read, keep existing skeleton as-is.
1007 }
1008 }
1009 }
1010 }
1011}
1012
1013/// Walk forward tracking tool_call/tool_result pairing; remove orphans.
1014/// Valid sequences: System → (User → Assistant/AssistantWithToolCalls → [ToolResult]* → ...)*
1015///
1016/// Drops three kinds of broken state:
1017///
1018/// 1. **Orphan ToolResult** — appears outside any `expecting` window
1019/// (no preceding AssistantWithToolCalls awaiting it). Removed solo.
1020/// 2. **Mid-conversation under-paired AssistantWithToolCalls** — has N
1021/// tool_calls but a Text / MultiPart / next ATC arrives before all N
1022/// ToolResults have been seen. The unsatisfied ATC AND any partial
1023/// ToolResults already paired with it are removed together. This is
1024/// the path that triggers DeepSeek's `insufficient tool messages
1025/// following tool_calls message` 400 — the strictest providers
1026/// require the wire-level invariant `len(asst.tool_calls) ==
1027/// len(following tool messages)` to hold for every ATC, not just the
1028/// most recent one.
1029/// 3. **Trailing under-paired AssistantWithToolCalls** — same as (2)
1030/// but the conversation ends mid-pairing. Handled by the rev-scan
1031/// after the main loop.
1032fn sanitize_messages(msgs: &mut Vec<Message>) {
1033 let mut to_remove: Vec<usize> = Vec::new();
1034 let mut expecting_tool_results = 0usize;
1035 // Track the most recent ATC and the ToolResult indices already
1036 // paired with it. On a boundary (Text / MultiPart / next ATC) with
1037 // `expecting > 0`, both the ATC and its partial results are dropped.
1038 let mut current_atc_idx: Option<usize> = None;
1039 let mut current_atc_results: Vec<usize> = Vec::new();
1040
1041 for i in 0..msgs.len() {
1042 match &msgs[i].content {
1043 MessageContent::ToolResult(_) | MessageContent::ToolResultRef(_) => {
1044 if expecting_tool_results > 0 {
1045 expecting_tool_results -= 1;
1046 current_atc_results.push(i);
1047 } else {
1048 to_remove.push(i);
1049 }
1050 }
1051 MessageContent::AssistantWithToolCalls { tool_calls, .. } => {
1052 if expecting_tool_results > 0 {
1053 if let Some(idx) = current_atc_idx {
1054 to_remove.push(idx);
1055 }
1056 to_remove.extend(current_atc_results.drain(..));
1057 } else {
1058 current_atc_results.clear();
1059 }
1060 expecting_tool_results = tool_calls.len();
1061 current_atc_idx = Some(i);
1062 }
1063 MessageContent::Text(_) | MessageContent::MultiPart { .. } => {
1064 if expecting_tool_results > 0 {
1065 if let Some(idx) = current_atc_idx {
1066 to_remove.push(idx);
1067 }
1068 to_remove.extend(current_atc_results.drain(..));
1069 } else {
1070 current_atc_results.clear();
1071 }
1072 expecting_tool_results = 0;
1073 current_atc_idx = None;
1074 }
1075 }
1076 }
1077
1078 if expecting_tool_results > 0 {
1079 for i in (0..msgs.len()).rev() {
1080 match &msgs[i].content {
1081 MessageContent::AssistantWithToolCalls { .. } => {
1082 to_remove.push(i);
1083 break;
1084 }
1085 MessageContent::ToolResult(_) | MessageContent::ToolResultRef(_) => {
1086 to_remove.push(i);
1087 }
1088 _ => break,
1089 }
1090 }
1091 }
1092
1093 to_remove.sort_unstable();
1094 to_remove.dedup();
1095 for &idx in to_remove.iter().rev() {
1096 msgs.remove(idx);
1097 }
1098}
1099
1100/// Clean message pipeline before sending to API.
1101/// Removes noise that degrades model decision quality:
1102/// - Empty/whitespace-only assistant messages
1103/// - Orphaned tool results (no matching tool_use)
1104/// - Consecutive same-role user messages (merge into one)
1105/// - Consecutive system messages (merge into one) — MiniMax-M2.7 rejects
1106/// adjacent `system` turns with `2013 invalid chat setting`; the
1107/// post-compression layout (orig system + cold-zone + drop-digest) is
1108/// the trigger.
1109fn clean_message_pipeline(msgs: &mut Vec<Message>) {
1110 // 1. Remove empty assistant messages (e.g., after <think> stripping)
1111 msgs.retain(|m| {
1112 if m.role == Role::Assistant {
1113 match &m.content {
1114 MessageContent::Text(t) => !t.trim().is_empty(),
1115 _ => true,
1116 }
1117 } else {
1118 true
1119 }
1120 });
1121
1122 // 2. Collect valid tool_use IDs from assistant messages
1123 let mut valid_call_ids: std::collections::HashSet<String> = std::collections::HashSet::new();
1124 for msg in msgs.iter() {
1125 if let MessageContent::AssistantWithToolCalls { tool_calls, .. } = &msg.content {
1126 for tc in tool_calls {
1127 valid_call_ids.insert(tc.id.clone());
1128 }
1129 }
1130 }
1131
1132 // 3. Remove orphaned tool results (no matching tool_use)
1133 msgs.retain(|m| {
1134 if let MessageContent::ToolResult(ref r) = m.content {
1135 valid_call_ids.contains(&r.call_id)
1136 } else if let MessageContent::ToolResultRef(ref r) = m.content {
1137 valid_call_ids.contains(&r.call_id)
1138 } else {
1139 true
1140 }
1141 });
1142
1143 // 4. Merge consecutive user messages into one
1144 let mut i = 1;
1145 while i < msgs.len() {
1146 if msgs[i].role == Role::User && msgs[i - 1].role == Role::User {
1147 if let (MessageContent::Text(prev), MessageContent::Text(curr)) =
1148 (&msgs[i - 1].content, &msgs[i].content)
1149 {
1150 let merged = format!("{}\n{}", prev, curr);
1151 msgs[i - 1].content = MessageContent::Text(merged);
1152 msgs.remove(i);
1153 continue;
1154 }
1155 }
1156 i += 1;
1157 }
1158
1159 // 5. Merge consecutive system messages into one. After compression the
1160 // wire layout is `system(orig) + system(cold-zone) [+ system(drop-digest)]`,
1161 // which MiniMax-M2.7's chat-setting validator rejects (empty stream then
1162 // 400 / 2013). Blank line between blocks preserves visual separation.
1163 let mut i = 1;
1164 while i < msgs.len() {
1165 if msgs[i].role == Role::System && msgs[i - 1].role == Role::System {
1166 if let (MessageContent::Text(prev), MessageContent::Text(curr)) =
1167 (&msgs[i - 1].content, &msgs[i].content)
1168 {
1169 let merged = format!("{}\n\n{}", prev, curr);
1170 msgs[i - 1].content = MessageContent::Text(merged);
1171 msgs.remove(i);
1172 continue;
1173 }
1174 }
1175 i += 1;
1176 }
1177}
1178
1179#[cfg(test)]
1180mod tests {
1181 use super::*;
1182 use crate::conversation::message::{Message, Role};
1183 use crate::conversation::Conversation;
1184
1185 #[test]
1186 fn apply_model_directives_noop_for_generic_model() {
        // Generic models such as gpt / claude / gemini trigger no directives — the original prompt is returned unchanged.
1188 let out = apply_model_directives("SYS", "gpt-4o");
1189 assert_eq!(out, "SYS");
1190 let out = apply_model_directives("SYS", "claude-opus-4-7");
1191 assert_eq!(out, "SYS");
1192 }
1193
1194 #[test]
1195 fn auto_compact_threshold_large_window_uses_large_buffer() {
1196 // > 100K → 13K buffer (Anthropic / CC territory). 200K - 13K = 187K.
1197 assert_eq!(auto_compact_threshold(200_000), 187_000);
1198 // 131K → boundary above the 100K cutoff, also gets 13K buffer.
1199 assert_eq!(auto_compact_threshold(131_072), 118_072);
1200 }
1201
1202 #[test]
1203 fn auto_compact_threshold_small_window_uses_small_buffer() {
1204 // ≤ 100K → 5K buffer (proxy-bound deployments). 65K - 5K = 60K
1205 // — exactly the sweet spot for a 65K self-hosted GLM cap:
1206 // compaction kicks in 5K below the proxy hard wall, leaving
1207 // a runway for one streaming response without forcing
1208 // pre-emptive compaction so early it shrinks the usable
1209 // session.
1210 assert_eq!(auto_compact_threshold(65_000), 60_000);
1211 // 100K is the boundary — still small-buffer (the cutoff is
1212 // strictly greater-than).
1213 assert_eq!(auto_compact_threshold(100_000), 95_000);
1214 // Just over 100K trips into large-buffer territory.
1215 assert_eq!(auto_compact_threshold(101_000), 88_000);
1216 }
1217
1218 #[test]
1219 fn auto_compact_threshold_tiny_window_caps_at_quarter() {
1220 // 8K Ollama: 5K buffer would still leave only 3K usable, but
1221 // window/4 = 2K caps the buffer below 5K → 6K threshold (~75%
1222 // of window). Scales the buffer when the window is too small
1223 // for the small-buffer constant.
1224 assert_eq!(auto_compact_threshold(8_000), 6_000);
1225 assert_eq!(auto_compact_threshold(16_000), 12_000);
1226 // At 20K the small-buffer constant (5K) lands at exactly
1227 // window/4, so 5K applies straight: 20K - 5K = 15K.
1228 assert_eq!(auto_compact_threshold(20_000), 15_000);
1229 }
1230
1231 #[test]
1232 fn auto_compact_threshold_handles_degenerate_window() {
1233 // ctx_window == 0 happens transiently before the provider config
1234 // loads; saturating_sub keeps it from panicking. Threshold is 0,
1235 // so any non-empty conversation trips the gate — caller's
1236 // `messages.len() < 12` check still gates the actual fire.
1237 assert_eq!(auto_compact_threshold(0), 0);
1238 }
1239
1240 #[test]
1241 fn needs_compression_fires_at_absolute_headroom_not_percentage() {
1242 // Reproduces the user's debug confusion: under the prior formula
1243 // a 131K window's threshold was `min(131K * 50%, 50K) = 50K` —
1244 // compression fired at 38% of window, leaving 81K of phantom
1245 // "available" headroom that wasn't actually used. The new
1246 // formula fires at 118K (90% of window), matching the user's
1247 // intuition of "fire when ~13K headroom remains".
1248 //
        // Test fixture: 16 alternating User/Assistant messages (8 pairs)
        // pushed directly onto `conv.messages` so the 12-message guard
        // passes (`add_user_message` merges consecutive User msgs, which
        // would collapse repeated calls into 1).
1252 let mut conv = Conversation::new();
1253 for i in 0..8 {
1254 conv.messages.push(Message::new(Role::User, format!("u{}", i)));
1255 conv.messages.push(Message::new(Role::Assistant, format!("a{}", i)));
1256 }
1257 assert_eq!(conv.messages.len(), 16);
1258 assert!(!needs_compression(&conv, 0, 131_072));
1259
1260 // 500K bytes ≈ 125K tokens (byte / 4) → exceeds 118K threshold.
1261 conv.messages
1262 .push(Message::new(Role::User, "x".repeat(500_000)));
1263 assert!(needs_compression(&conv, 0, 131_072));
1264 }
1265
1266 #[test]
1267 fn tool_result_ref_token_estimate_uses_summary_not_byte_size() {
1268 // Pre-fix bug: ToolResultRef estimated from the full original
1269 // content size (could be 50K+ for a large file read), but at
1270 // send time only `r.summary` (a short string) was actually
1271 // serialised. The estimator overcounted by 5-50× on
1272 // externalised results, pushing compression to fire on phantom
1273 // budget pressure.
1274 use crate::conversation::message::MessageContent;
1275 use crate::tool::result_store::ToolResultRef;
1276
1277 let big_ref = ToolResultRef {
1278 call_id: "call_1".into(),
1279 hash: "deadbeef".into(),
1280 summary: "hello".into(), // 5 bytes
1281 byte_size: 200_000, // pretend the disk-cached blob is 200KB
1282 success: true,
1283 };
1284 let msg = Message {
1285 role: Role::User,
1286 content: MessageContent::ToolResultRef(big_ref),
1287 };
1288 // (5 + 10) / 4 + 4 = 7. Pre-fix this was (200000 + 10) / 4 + 4 = 50006.
1289 assert!(
1290 msg.estimate_tokens() < 20,
1291 "expected estimate to track summary size, got {}",
1292 msg.estimate_tokens()
1293 );
1294 }
1295
1296 #[test]
1297 fn apply_model_directives_cn_lock_for_cjk_tier() {
1298 for id in ["qwen3-max", "deepseek-v3", "kimi-k2"] {
1299 let out = apply_model_directives("SYS", id);
1300 assert!(
1301 out.contains("用户可见的输出请用中文"),
1302 "model {id} missing CN lock"
1303 );
1304 assert!(
1305 !out.contains("THINKING 简洁纪律"),
1306 "model {id} got MiniMax directive erroneously"
1307 );
1308 }
1309 }
1310
1311 #[test]
1312 fn apply_model_directives_minimax_gets_both_blocks() {
1313 let out = apply_model_directives("SYS", "minimax-m2");
1314 assert!(out.contains("用户可见的输出请用中文"));
1315 assert!(out.contains("THINKING 简洁纪律"));
        // The MiniMax directive must come after the CN lock (recency: closer to the tail = higher priority).
1317 let cn_idx = out.find("用户可见的输出").unwrap();
1318 let thinking_idx = out.find("THINKING").unwrap();
1319 assert!(thinking_idx > cn_idx);
1320 }
1321
1322 #[test]
1323 fn apply_model_directives_preserves_system_prompt_prefix() {
        // Append-only mode: the original prompt must be preserved 100% at the start so the cache key is not broken.
1325 let sys = "You are AtomCode. Working directory: /tmp\n";
1326 let out = apply_model_directives(sys, "minimax-m2");
1327 assert!(out.starts_with(sys));
1328 }
1329
1330 #[test]
1331 fn test_budgeted_empty_conversation() {
1332 let conv = Conversation::new();
1333 let (msgs, _stats) = build_messages(&conv, "system prompt", 8000, "");
1334 assert_eq!(msgs.len(), 1);
1335 assert!(matches!(msgs[0].role, Role::System));
1336 }
1337
1338 #[test]
1339 fn test_budgeted_includes_recent_messages() {
1340 let mut conv = Conversation::new();
1341 conv.add_user_message("hello");
1342 conv.messages
1343 .push(Message::new(Role::Assistant, "hi there"));
1344 conv.add_user_message("do something");
1345
1346 let (msgs, _stats) = build_messages(&conv, "sys", 8000, "");
1347 assert_eq!(msgs.len(), 4); // system + 3 messages
1348 assert!(matches!(msgs[0].role, Role::System));
1349 }
1350
1351 #[test]
1352 fn test_budgeted_sends_all_when_under_80pct() {
1353 use crate::tool::{ToolCall, ToolResult};
1354 let mut conv = Conversation::new();
1355
1356 // Create 2 turns with small tool results — should all fit
1357 for turn in 0..2 {
1358 conv.add_user_message(&format!("task {}", turn));
1359 let call = ToolCall {
1360 id: format!("call_{}", turn),
1361 name: "read_file".to_string(),
1362 arguments: format!(r#"{{"file_path":"/tmp/file_{}.rs"}}"#, turn),
1363 };
1364 conv.add_assistant_tool_calls(None, vec![call], None);
1365 conv.add_tool_result(ToolResult {
1366 call_id: format!("call_{}", turn),
1367 output: "short result".to_string(),
1368 success: true,
1369 });
1370 }
1371 conv.add_user_message("now what?");
1372
1373 // Large budget — everything fits
1374 let (msgs, stats) = build_messages(&conv, "sys", 100000, "");
1375 // system + 7 messages (2 turns * 3 msgs each + final user)
1376 assert_eq!(msgs.len(), 8);
1377 assert!(matches!(msgs[0].role, Role::System));
1378 assert_eq!(msgs.last().unwrap().text(), Some("now what?"));
1379 assert_eq!(stats.dropped_tokens, 0, "Nothing should be dropped");
1380 }
1381
1382 #[test]
1383 fn test_budgeted_drops_oldest_turns_when_over_budget() {
1384 use crate::tool::{ToolCall, ToolResult};
1385 let mut conv = Conversation::new();
1386
1387 // Create 5 turns with large tool results (2000 chars each ≈ 500 tokens)
1388 // Total ≈ 5 * 4 * 500 = 10000 tokens + overhead, budget 80% of 4000 = 3200
1389 for turn in 0..5 {
1390 conv.add_user_message(&format!("task {}", turn));
1391 for i in 0..4 {
1392 let idx = turn * 4 + i;
1393 let call = ToolCall {
1394 id: format!("call_{}", idx),
1395 name: "read_file".to_string(),
1396 arguments: format!(r#"{{"file_path":"/tmp/file_{}.rs"}}"#, idx),
1397 };
1398 conv.add_assistant_tool_calls(None, vec![call], None);
1399 conv.add_tool_result(ToolResult {
1400 call_id: format!("call_{}", idx),
1401 output: "x".repeat(2000),
1402 success: true,
1403 });
1404 }
1405 }
1406 conv.add_user_message("now what?");
1407
1408 let (msgs, stats) = build_messages(&conv, "sys", 4000, "");
1409 // Oldest turns should be dropped
1410 assert!(
1411 stats.dropped_tokens > 0,
1412 "Some turns should have been dropped"
1413 );
1414 // Most recent user message must survive
1415 assert_eq!(msgs.last().unwrap().text(), Some("now what?"));
1416 // System prompt must be first
1417 assert!(matches!(msgs[0].role, Role::System));
1418 }
1419
1420 #[test]
1421 fn test_budgeted_always_keeps_latest_turn() {
1422 use crate::tool::{ToolCall, ToolResult};
1423 let mut conv = Conversation::new();
1424
1425 // Create a single turn with very large output
1426 conv.add_user_message("big task");
1427 let call = ToolCall {
1428 id: "c0".to_string(),
1429 name: "bash".to_string(),
1430 arguments: "{}".to_string(),
1431 };
1432 conv.add_assistant_tool_calls(Some("running..."), vec![call], None);
1433 conv.add_tool_result(ToolResult {
1434 call_id: "c0".to_string(),
1435 output: "z".repeat(50000),
1436 success: true,
1437 });
1438
1439 // Very small budget — system prompt is always kept
1440 let (msgs, _stats) = build_messages(&conv, "sys", 1000, "");
1441 assert!(!msgs.is_empty(), "Must at least have system prompt");
1442 assert!(matches!(msgs[0].role, Role::System));
1443 }
1444
1445 #[test]
1446 fn test_budgeted_never_returns_system_only_when_messages_exist() {
1447 // Regression for 2026-04-13 bug: a single oversized tool_result caused
1448 // `survived_start = self.messages.len()` → no non-system messages in result
1449 // → sent=0 → agent blind.
1450 //
        // Invariant: if `conv.messages` is non-empty, the output of
        // `build_messages` (called below; referred to as
        // `to_provider_messages_budgeted` in earlier notes) must always
        // include at least one non-system message.
1453 use crate::tool::{ToolCall, ToolResult};
1454 let mut conv = Conversation::new();
1455
1456 // 5 normal turns
1457 for i in 0..5 {
1458 conv.add_user_message(&format!("task {}", i));
1459 let call = ToolCall {
1460 id: format!("c{}", i),
1461 name: "bash".to_string(),
1462 arguments: "{}".to_string(),
1463 };
1464 conv.add_assistant_tool_calls(Some("ok"), vec![call], None);
1465 conv.add_tool_result(ToolResult {
1466 call_id: format!("c{}", i),
1467 output: "x".repeat(500),
1468 success: true,
1469 });
1470 }
1471
1472 // 6th turn with a pathologically oversized output (50K tokens worth of 'z')
1473 conv.add_user_message("find everything");
1474 let call = ToolCall {
1475 id: "c5".to_string(),
1476 name: "bash".to_string(),
1477 arguments: "{}".to_string(),
1478 };
1479 conv.add_assistant_tool_calls(Some("finding..."), vec![call], None);
1480 conv.add_tool_result(ToolResult {
1481 call_id: "c5".to_string(),
1482 output: "z".repeat(200_000), // huge
1483 success: true,
1484 });
1485
1486 // Budget too small to fit the huge output — compaction MUST still leave
1487 // at least one non-system message.
1488 let (msgs, _stats) = build_messages(&conv, "sys", 10_000, "");
1489 let non_system = msgs
1490 .iter()
1491 .filter(|m| !matches!(m.role, Role::System))
1492 .count();
1493 assert!(
1494 non_system > 0,
1495 "never return system-only result when messages exist — got msgs.len()={}",
1496 msgs.len()
1497 );
1498 }
1499
1500 #[test]
1501 fn test_budgeted_emergency_restores_last_user_when_all_else_dropped() {
1502 // Even if every turn gets dropped by some path, the emergency fallback at
1503 // the bottom of to_provider_messages_budgeted should graft back the last
1504 // user message rather than return system-only.
1505 let mut conv = Conversation::new();
1506 conv.add_user_message("original question");
1507 // Add 20 turns of huge assistant+tool content to force aggressive drop
1508 for i in 0..20 {
1509 use crate::tool::{ToolCall, ToolResult};
1510 conv.add_assistant_tool_calls(
1511 Some(&format!("reasoning {}", i)),
1512 vec![ToolCall {
1513 id: format!("c{}", i),
1514 name: "bash".to_string(),
1515 arguments: "{}".to_string(),
1516 }],
1517 None,
1518 );
1519 conv.add_tool_result(ToolResult {
1520 call_id: format!("c{}", i),
1521 output: "y".repeat(10_000),
1522 success: true,
1523 });
1524 }
1525
1526 let (msgs, _stats) = build_messages(&conv, "sys", 5_000, "");
1527 let has_user = msgs.iter().any(|m| matches!(m.role, Role::User));
1528 assert!(
1529 has_user,
1530 "last user message must always survive, got {} msgs",
1531 msgs.len()
1532 );
1533 }
1534
1535 #[test]
1536 fn microcompact_uses_generic_format_with_tool_label_from_call_id() {
1537 // microcompact emits a single generic format:
1538 // `[<tool> <ok|FAILED>: N lines, first: <line>]`. Tool label comes
1539 // from the model's own `tool_calls.name`, not a `match` on
1540 // hardcoded strings — passes the project's tech-stack-neutrality
1541 // rule. Bash, grep, glob, and unknown-tool calls all flow
1542 // through the same template.
1543 //
1544 // read_file is exempted (5-7 atomgr datalog showed weak models
1545 // build "伪自信" from `first: 205| pub async fn dynamic_connect(`
1546 // and edit blind). Skip behavior is covered by
1547 // `microcompact_skips_read_file_to_preserve_long_session_context`.
1548 //
1549 // Calls `microcompact` directly so the test isolates stub format
1550 // from the rendering pipeline's drop / compression logic.
1551 use crate::tool::{ToolCall, ToolResult};
1552
1553 let mut msgs: Vec<Message> = vec![Message::new(Role::System, "sys")];
1554 msgs.push(Message::new(Role::User, "explore"));
1555
1556 let kinds = [
1557 ("c_bok", "bash", true),
1558 ("c_bfail", "bash", false),
1559 ("c_grep", "grep", true),
1560 ("c_mcp", "mcp_remote.exec", true),
1561 ];
1562 for (id, name, success) in &kinds {
1563 msgs.push(Message {
1564 role: Role::Assistant,
1565 content: MessageContent::AssistantWithToolCalls {
1566 text: None,
1567 tool_calls: vec![ToolCall {
1568 id: (*id).to_string(),
1569 name: (*name).to_string(),
1570 arguments: "{}".into(),
1571 }],
1572 reasoning_content: None,
1573 thinking_blocks: Vec::new(),
1574 },
1575 });
1576 msgs.push(Message {
1577 role: Role::Tool,
1578 content: MessageContent::ToolResult(ToolResult {
1579 call_id: (*id).to_string(),
1580 output: format!("first line for {}\n{}", name, "x".repeat(4_000)),
1581 success: *success,
1582 }),
1583 });
1584 }
1585
1586 // Anchor the next turn so the prior tool results above are
1587 // eligible for compaction (turn-aware boundary).
1588 msgs.push(Message::new(Role::User, "now what"));
1589
1590 let n = msgs.len();
1591 // Low threshold so microcompact fires deterministically.
1592 microcompact(&mut msgs, n, 1_000);
1593
1594 let find_by_id = |id: &str| -> Option<String> {
1595 msgs.iter().find_map(|m| {
1596 if let MessageContent::ToolResult(r) = &m.content {
1597 if r.call_id == id {
1598 return Some(r.output.clone());
1599 }
1600 }
1601 None
1602 })
1603 };
1604
1605 // bash (success) → compacted with `bash ok: ...` label.
1606 let bok = find_by_id("c_bok").expect("c_bok must survive");
1607 assert!(
1608 bok.starts_with("[bash ok: ") && bok.contains("first: "),
1609 "bash success format mismatch: {}",
1610 bok
1611 );
1612
1613 // bash (failure) → `bash FAILED: ...` label preserves the
1614 // success/fail axis the model needs for retry reasoning.
1615 let bfail = find_by_id("c_bfail").expect("c_bfail must survive");
1616 assert!(
1617 bfail.starts_with("[bash FAILED: ") && bfail.contains("first: "),
1618 "bash failure format mismatch: {}",
1619 bfail
1620 );
1621
1622 // grep and an unknown tool name use the same template — no
1623 // special-case match arms inside microcompact (read_file is
1624 // exempted; see `microcompact_skips_read_file_*`).
1625 for (id, expected_label) in [
1626 ("c_grep", "grep"),
1627 ("c_mcp", "mcp_remote.exec"),
1628 ] {
1629 let body = find_by_id(id).unwrap_or_else(|| panic!("{} must survive", id));
1630 assert!(
1631 body.starts_with(&format!("[{} ok: ", expected_label)),
1632 "{} expected generic `[{} ok: ...]` format, got: {}",
1633 id,
1634 expected_label,
1635 body
1636 );
1637 assert!(
1638 body.contains("first: first line for"),
1639 "{} should preserve first-line snippet, got: {}",
1640 id,
1641 body
1642 );
1643 }
1644 }
1645
1646 /// 5-7 atomgr datalog (build 942b615): 1704/1704 bash stubs surfaced
1647 /// `first: [elapsed: Xs, exit: N]` — framework metadata, zero signal.
1648 /// Stub now skips that line and shows line 2 (the real output / real
1649 /// error). Failed bash retry decisions go from "exit 101 of unknown
1650 /// origin" to "actual error: ...".
1651 #[test]
1652 fn build_compact_stub_skips_bash_elapsed_metadata() {
1653 let bash_failure = "[elapsed: 1.9s, exit: 101]\nerror: cannot find type `Foo` in this scope";
1654 let stub = build_compact_stub("bash", bash_failure, false);
1655 assert!(
1656 stub.contains("error: cannot find type"),
1657 "bash stub must surface the actual error, not the elapsed metadata: {}",
1658 stub
1659 );
1660 assert!(
1661 !stub.contains("first: [elapsed:"),
1662 "bash stub first-line must skip the elapsed metadata: {}",
1663 stub
1664 );
1665 }
1666
1667 /// Single-line bash (`wc -l`, `echo $?`, etc.) has no line 2 to fall
1668 /// through to. Stub must use whatever line 1 is rather than blanking.
1669 #[test]
1670 fn build_compact_stub_falls_back_to_line1_when_only_one_line() {
1671 let one_liner = "42";
1672 let stub = build_compact_stub("bash", one_liner, true);
1673 assert!(stub.contains("first: 42"), "got: {}", stub);
1674 }
1675
1676 /// `[elapsed:` skip is bash-only by virtue of the prefix being unique
1677 /// to our bash tool. grep / edit_file / web_fetch outputs do NOT
1678 /// start with `[elapsed:` so they hit the normal line-1 path. This
1679 /// test pins that the skip doesn't accidentally eat the first useful
1680 /// line of those tools.
1681 #[test]
1682 fn build_compact_stub_unaffected_for_non_bash_tools() {
1683 let grep = "src/foo.rs:42: fn bar() {}\nsrc/baz.rs:10: fn baz()";
1684 let stub = build_compact_stub("grep", grep, true);
1685 assert!(
1686 stub.contains("first: src/foo.rs:42:"),
1687 "grep stub must keep line 1 intact: {}",
1688 stub
1689 );
1690
1691 let edit = "Edited /path/to/file.rs (-3 +5 lines).";
1692 let stub = build_compact_stub("edit_file", edit, true);
1693 assert!(stub.contains("first: Edited /path"), "got: {}", stub);
1694 }
1695
1696 /// 5-7 atomgr datalog (atomgr-2d99b47d/2026-05-07_00-28-34): T22-T29
1697 /// reveal weak models develop "伪自信" when read_file is stubbed —
1698 /// `[read_file ok: 115 lines, first: 205| pub async fn dynamic_connect(]`
1699 /// gives just enough surface (line number + function name) for the
1700 /// model to think it remembers the body, then it edits blind. Result:
1701 /// 6 turns of patch-and-repatch the same file. Keeping read_file
1702 /// FULL preserves attention on the actual code; D3 FileStore handles
1703 /// the disk-side cost of re-reads transparently.
1704 #[test]
1705 fn microcompact_skips_read_file_to_preserve_long_session_context() {
1706 use crate::tool::{ToolCall, ToolResult};
1707 let mut conv = Conversation::new();
1708 conv.add_user_message("explore");
1709
1710 // One read_file call with a large body — would normally be
1711 // compacted under the generic path.
1712 conv.add_assistant_tool_calls(
1713 None,
1714 vec![ToolCall {
1715 id: "c_read".into(),
1716 name: "read_file".into(),
1717 arguments: "{}".into(),
1718 }],
1719 None,
1720 );
1721 let read_body = format!("first line of read\n{}", "x".repeat(5_000));
1722 conv.add_tool_result(ToolResult {
1723 call_id: "c_read".into(),
1724 output: read_body.clone(),
1725 success: true,
1726 });
1727
        // Pad with bash so total_chars crosses microcompact's
        // threshold. With the 40K-token budget used below the
        // threshold is 112K chars, so the 30 padding bashes + the
        // read_file body (~125K chars total) reliably trigger
        // microcompact.
1732 for i in 0..30 {
1733 let id = format!("c_pad{}", i);
1734 conv.add_assistant_tool_calls(
1735 None,
1736 vec![ToolCall {
1737 id: id.clone(),
1738 name: "bash".into(),
1739 arguments: "{}".into(),
1740 }],
1741 None,
1742 );
1743 conv.add_tool_result(ToolResult {
1744 call_id: id,
1745 output: format!("[elapsed: 0.0s, exit: 0]\n{}", "x".repeat(4_000)),
1746 success: true,
1747 });
1748 }
1749 conv.add_user_message("now what");
1750
1751 // 40K budget → 112K char threshold. Payload (read body 5K +
1752 // 30 × 4K padding ≈ 125K chars / ~31K tokens) crosses
1753 // threshold but fits budget without triggering build_messages
1754 // pre-microcompact drops.
1755 let (msgs, _) = build_messages(&conv, "sys", 40_000, "");
1756
1757 // Locate the read_file ToolResult in the rendered messages.
1758 let body = msgs
1759 .iter()
1760 .find_map(|m| {
1761 if let MessageContent::ToolResult(r) = &m.content {
1762 if r.call_id == "c_read" {
1763 return Some(r.output.clone());
1764 }
1765 }
1766 None
1767 })
1768 .expect("c_read must survive in rendered messages");
1769
1770 // Read body must remain FULL — never replaced with the generic
1771 // `[read_file ok: ... first: ...]` stub.
1772 assert!(
1773 !body.starts_with("[read_file "),
1774 "read_file got compacted (伪自信 risk): {}",
1775 &body[..body.len().min(200)]
1776 );
1777 assert_eq!(
1778 body.len(),
1779 read_body.len(),
1780 "read_file body length must equal original (uncompacted)"
1781 );
1782 assert!(
1783 body.contains("first line of read"),
1784 "first line lost: {}",
1785 &body[..body.len().min(200)]
1786 );
1787
1788 // Sanity: bash padding ToolResults DID get compacted — confirms
1789 // the threshold actually triggered, the test isn't passing
1790 // because microcompact was a no-op.
1791 let any_bash_compacted = msgs.iter().any(|m| {
1792 if let MessageContent::ToolResult(r) = &m.content {
1793 r.output.starts_with("[bash ok: ")
1794 } else {
1795 false
1796 }
1797 });
1798 assert!(
1799 any_bash_compacted,
1800 "bash padding should have been compacted; if not, the \
1801 threshold isn't actually triggering and read_file passing \
1802 through is a false positive"
1803 );
1804 }
1805
/// Regression (5-8 atomgr session): microcompact used to stub tool
/// results that belonged to the turn still in progress, so mid-turn
/// the model could no longer see what it had just done and fell back
/// to echoing `HELLO_TEST_12345` self-checks.
///
/// Expected behavior pinned here: the last `Role::User` message in
/// the rendered Vec marks the start of the active turn. Everything
/// from that message onward stays full-fidelity; only strictly older
/// tool results may be replaced with stubs.
///
/// `microcompact` is called directly rather than through
/// `build_messages`, so the boundary logic is exercised in isolation
/// from the rendering pipeline's drop / token-budget handling.
#[test]
fn microcompact_preserves_current_turn_in_full() {
    use crate::tool::{ToolCall, ToolResult};

    /// Appends one assistant `bash` tool call and its matching tool
    /// result; the result body is a fixed header line followed by
    /// 4_000 repetitions of `fill`.
    fn push_bash_round(msgs: &mut Vec<Message>, id: String, fill: &str) {
        msgs.push(Message {
            role: Role::Assistant,
            content: MessageContent::AssistantWithToolCalls {
                text: None,
                tool_calls: vec![ToolCall {
                    id: id.clone(),
                    name: "bash".into(),
                    arguments: "{}".into(),
                }],
                reasoning_content: None,
                thinking_blocks: Vec::new(),
            },
        });
        msgs.push(Message {
            role: Role::Tool,
            content: MessageContent::ToolResult(ToolResult {
                call_id: id,
                output: format!("[elapsed: 0.0s, exit: 0]\n{}", fill.repeat(4_000)),
                success: true,
            }),
        });
    }

    // Layout with an unambiguous turn boundary:
    //   System | User#1 | (Asst + Tool)×15 | User#2 | (Asst + Tool)×10
    // User#2 is the last user message, so the 10 rounds after it form
    // the current turn.
    let mut msgs: Vec<Message> = vec![Message::new(Role::System, "sys")];

    // ── PRIOR turn (eligible for stubbing) ────────────────
    msgs.push(Message::new(Role::User, "first task"));
    for i in 0..15 {
        push_bash_round(&mut msgs, format!("prior_{}", i), "p");
    }

    // ── CURRENT turn (must stay full) ─────────────────────
    msgs.push(Message::new(Role::User, "second task"));
    for i in 0..10 {
        push_bash_round(&mut msgs, format!("current_{}", i), "c");
    }

    // Rough pre-compaction size, used only in the failure message
    // below. Assistant tool-call messages are counted as a flat 100.
    let total_chars: usize = msgs
        .iter()
        .map(|m| match &m.content {
            MessageContent::ToolResult(r) => r.output.len(),
            MessageContent::Text(t) => t.len(),
            _ => 100,
        })
        .sum();

    // Threshold of 1_000 is far below the payload, so microcompact
    // fires deterministically.
    let n = msgs.len();
    microcompact(&mut msgs, n, 1_000);

    // Gathers (call_id, output) for every ToolResult whose call_id
    // starts with `prefix`.
    let collect = |prefix: &str| -> Vec<(String, String)> {
        msgs.iter()
            .filter_map(|m| match &m.content {
                MessageContent::ToolResult(r) if r.call_id.starts_with(prefix) => {
                    Some((r.call_id.clone(), r.output.clone()))
                }
                _ => None,
            })
            .collect()
    };

    // PRIOR turn: every tool result must be stubbed.
    let prior = collect("prior_");
    assert_eq!(prior.len(), 15, "expected 15 prior tool results");
    for (cid, body) in &prior {
        assert!(
            body.starts_with("[bash "),
            "prior turn `{}` must be stubbed; got body of len={} starting {:?}\n\
             (total_chars before microcompact was {})",
            cid,
            body.len(),
            &body[..body.len().min(80)],
            total_chars
        );
        assert!(
            body.len() < 200,
            "prior stub should be < 200 bytes, got {}",
            body.len()
        );
    }

    // CURRENT turn: every tool result must remain FULL.
    let current = collect("current_");
    assert_eq!(current.len(), 10, "expected 10 current tool results");
    // Fix: the source had `¤t` here, a mis-decoded `&current` that
    // does not compile.
    for (cid, body) in &current {
        assert!(
            !body.starts_with("[bash "),
            "current turn `{}` must NOT be stubbed (turn-aware preservation): \
             got {:?}",
            cid,
            &body[..body.len().min(80)]
        );
        assert!(
            body.len() > 4_000,
            "current tool result must keep its full payload (>4K chars), \
             got {} bytes",
            body.len()
        );
    }
}
1942
/// Compaction must be stable: rendering the same conversation twice
/// has to yield byte-identical tool-result bodies, and a stub must
/// never degrade into a less useful
/// "[older tool result collapsed (60 chars dropped)]" form (the bug
/// pattern this test was written against).
///
/// NOTE(review): the original note attributes this to a
/// `len <= MIN_COLLAPSE_SIZE` guard inside microcompact that skips
/// results already under ~500 chars; that guard is outside this test
/// — confirm there.
#[test]
fn microcompact_is_idempotent_no_double_stub() {
    use crate::tool::{ToolCall, ToolResult};

    /// Extracts the output of every plain ToolResult, in order.
    fn tool_outputs(rendered: &[Message]) -> Vec<String> {
        let mut outputs = Vec::new();
        for msg in rendered {
            if let MessageContent::ToolResult(r) = &msg.content {
                outputs.push(r.output.clone());
            }
        }
        outputs
    }

    // 30 bash rounds of ~4K chars each, bracketed by user messages.
    let mut conv = Conversation::new();
    conv.add_user_message("trigger");
    for idx in 0..30 {
        let call_id = format!("c{}", idx);
        let call = ToolCall {
            id: call_id.clone(),
            name: "bash".to_string(),
            arguments: "{}".to_string(),
        };
        conv.add_assistant_tool_calls(None, vec![call], None);
        conv.add_tool_result(ToolResult {
            call_id,
            output: format!("first line\n{}", "x".repeat(4_000)),
            success: true,
        });
    }
    conv.add_user_message("done");

    // Two independent renders of the same conversation and budget.
    let (first_render, _) = build_messages(&conv, "sys", 131_072, "");
    let (second_render, _) = build_messages(&conv, "sys", 131_072, "");

    let first_pass = tool_outputs(&first_render);
    let second_pass = tool_outputs(&second_render);

    // Same input → same compacted bodies, no degradation.
    assert_eq!(first_pass, second_pass);

    // Any body that was stubbed must still carry its `first: ` slot,
    // i.e. it stayed in the `[bash ok: ...]` shape.
    for body in first_pass.iter().filter(|b| b.starts_with("[bash")) {
        assert!(
            body.contains("first: "),
            "stub lost its first-line slot: {}",
            body
        );
    }
}
2002
/// Applying a compression must record one cold-zone summary, shrink
/// the tracked turns, and surface the summary in rendered output.
#[test]
fn test_cold_zone_compression() {
    use crate::tool::{ToolCall, ToolResult};

    // Eight turns, three messages each: user, assistant tool call,
    // tool result.
    let mut conv = Conversation::new();
    for turn in 0..8 {
        let call_id = format!("c{}", turn);
        conv.add_user_message(&format!("task {}", turn));
        conv.add_assistant_tool_calls(
            Some("ok"),
            vec![ToolCall {
                id: call_id.clone(),
                name: "bash".into(),
                arguments: "{}".into(),
            }],
            None,
        );
        conv.add_tool_result(ToolResult {
            call_id,
            output: "x".repeat(100),
            success: true,
        });
    }

    // Compress away the first 9 messages (3 turns × 3 messages).
    conv.apply_compression(9, String::from("User ran tasks 0, 1, 2 with bash."));

    // Exactly one summary recorded; 8 − 3 = 5 turns still tracked.
    assert_eq!(conv.cold_summaries.len(), 1);
    assert_eq!(conv.turn_tracker.turns.len(), 5);

    // With a generous budget the cold-zone text must be rendered.
    let (msgs, _stats) = build_messages(&conv, "sys", 100000, "");
    let has_cold = msgs
        .iter()
        .filter_map(|m| m.text())
        .any(|t| t.contains("Earlier conversation history"));
    assert!(has_cold, "Cold zone summary should appear in output");
}
2040
/// Regression: MiniMax-M2.7 answers with empty content + 400
/// (`2013 invalid chat setting`) when a request carries two adjacent
/// `system` messages. The post-compression layout used to emit
/// `system(orig) + system(cold-zone)` back to back; per the original
/// note, `clean_message_pipeline` now coalesces them. This test pins
/// the observable result: no adjacent system messages, and the merged
/// one keeps both pieces of text.
#[test]
fn test_no_consecutive_system_messages_after_compression() {
    use crate::tool::{ToolCall, ToolResult};

    // Eight user / tool-call / tool-result turns.
    let mut conv = Conversation::new();
    for turn in 0..8 {
        let call_id = format!("c{}", turn);
        conv.add_user_message(&format!("task {}", turn));
        conv.add_assistant_tool_calls(
            Some("ok"),
            vec![ToolCall {
                id: call_id.clone(),
                name: "bash".into(),
                arguments: "{}".into(),
            }],
            None,
        );
        conv.add_tool_result(ToolResult {
            call_id,
            output: "x".repeat(100),
            success: true,
        });
    }

    // Compress the first three turns so a cold-zone summary exists.
    conv.apply_compression(9, String::from("User ran tasks 0, 1, 2 with bash."));
    assert_eq!(conv.cold_summaries.len(), 1);

    let (msgs, _stats) = build_messages(&conv, "you are atomcode", 100_000, "");

    // Walk every neighbouring pair; none may be system + system.
    for (left, right) in msgs.iter().zip(msgs.iter().skip(1)) {
        let both_system = left.role == Role::System && right.role == Role::System;
        assert!(
            !both_system,
            "consecutive system messages found at the wire boundary"
        );
    }

    // The first system message must carry the original prompt and the
    // cold-zone summary, so no context is lost by the merge.
    let first_system = msgs.iter().find(|m| m.role == Role::System);
    let merged = first_system
        .and_then(|m| m.text())
        .expect("at least one system message");
    assert!(
        merged.contains("you are atomcode"),
        "merged system must keep original prompt"
    );
    assert!(
        merged.contains("Earlier conversation history"),
        "merged system must keep cold-zone summary"
    );
}
2093
/// With no summaries available and a budget far below the payload,
/// `build_messages` must report dropped tokens and still lead with
/// the system message.
#[test]
fn test_budgeted_drops_when_no_summary_and_over_budget() {
    use crate::tool::{ToolCall, ToolResult};

    // Three turns, each with a 4_000-char tool result and no summary.
    let mut conv = Conversation::new();
    for turn in 0..3 {
        let call_id = format!("c{}", turn);
        conv.add_user_message(&format!("task {}", turn));
        conv.add_assistant_tool_calls(
            Some("ok"),
            vec![ToolCall {
                id: call_id.clone(),
                name: "bash".into(),
                arguments: "{}".into(),
            }],
            None,
        );
        conv.add_tool_result(ToolResult {
            call_id,
            output: "x".repeat(4000),
            success: true,
        });
    }

    // A 2000-token budget is too small for the payload above.
    let (msgs, stats) = build_messages(&conv, "sys", 2000, "");

    assert!(
        stats.dropped_tokens > 0,
        "Should drop turns when over budget"
    );
    // Whatever was dropped, the system prompt stays in first position.
    assert!(matches!(msgs[0].role, Role::System));
}
2123
/// Regression ("bug b"): rendered output must stay under 80% of the
/// token budget even after a compression has already run once.
///
/// Background from the original report (both mechanisms live outside
/// this test — confirm there): a non-empty `cold_summaries` disables
/// an earlier 80% drop cap, and microcompact leaves the trailing
/// `OTHER_KEEP=20` messages untouched. Together that left the recent
/// window with no size enforcement, so many mid-sized ToolResults
/// could exceed the budget. A final post-cleanup ceiling is expected
/// to condense the oldest ToolResults until the estimate fits.
#[test]
fn test_final_byte_ceiling_condenses_oversized_recent_toolresults() {
    use crate::tool::{ToolCall, ToolResult};

    let mut conv = Conversation::new();
    // Precondition: pretend a prior compression already happened.
    conv.cold_summaries.push(String::from("earlier task summary"));

    // 20 turns, each ending in a 6_000-char bash result.
    for turn in 0..20 {
        let call_id = format!("c{}", turn);
        conv.add_user_message(&format!("task {}", turn));
        let call = ToolCall {
            id: call_id.clone(),
            name: "bash".into(),
            arguments: "{}".into(),
        };
        conv.add_assistant_tool_calls(Some("ok"), vec![call], None);
        conv.add_tool_result(ToolResult {
            call_id,
            output: "x".repeat(6000),
            success: true,
        });
    }

    // Budget of 10K tokens → the 80% ceiling is 8K tokens.
    let (msgs, _stats) = build_messages(&conv, "sys", 10_000, "");

    let mut total_tokens: usize = 0;
    for m in &msgs {
        total_tokens += m.estimate_tokens();
    }
    assert!(
        total_tokens <= 8_000,
        "Total estimated tokens {} exceeded 80% ceiling 8000 — \
         final byte ceiling did not run",
        total_tokens,
    );

    // Some message must still expose a run of 100 'x' through
    // `text()`, taken as evidence that the newest result was kept.
    // NOTE(review): this matches any message, not specifically the
    // newest turn's result.
    let needle = "x".repeat(100);
    let newest_still_full = msgs
        .iter()
        .any(|m| matches!(m.text(), Some(t) if t.contains(&needle)));
    assert!(
        newest_still_full,
        "Newest turn's full-size tool result must be preserved",
    );
}
2180
/// With an ample budget, `build_messages` must emit the system
/// prompt followed by every conversation message in original order.
#[test]
fn test_budgeted_preserves_message_order() {
    // Alternating user / assistant text, ending on a user message.
    let mut conv = Conversation::new();
    conv.add_user_message("first");
    conv.messages.push(Message::new(Role::Assistant, "response 1"));
    conv.add_user_message("second");
    conv.messages.push(Message::new(Role::Assistant, "response 2"));
    conv.add_user_message("third");

    let (msgs, _stats) = build_messages(&conv, "sys", 100000, "");

    // Index 0 is the system prompt; the five conversation messages
    // follow at indices 1..=5.
    assert_eq!(msgs.len(), 6);
    let expected_texts = vec!["first", "response 1", "second", "response 2", "third"];
    for (offset, expected) in expected_texts.into_iter().enumerate() {
        assert_eq!(msgs[offset + 1].text(), Some(expected));
    }
}
2201
/// A tool result with no matching `AssistantWithToolCalls` must be
/// stripped by `sanitize_messages`.
#[test]
fn test_sanitize_removes_orphan_tool_results() {
    use crate::tool::ToolResult;

    // This tool result has no assistant tool call before it.
    let orphan = Message {
        role: Role::Tool,
        content: MessageContent::ToolResult(ToolResult {
            call_id: String::from("orphan_1"),
            output: String::from("some output"),
            success: true,
        }),
    };

    let mut msgs = Vec::new();
    msgs.push(Message::new(Role::System, "sys"));
    msgs.push(orphan);
    msgs.push(Message::new(Role::User, "hello"));

    sanitize_messages(&mut msgs);

    // Only System + User remain, in that order.
    assert_eq!(msgs.len(), 2);
    assert!(matches!(msgs[0].role, Role::System));
    assert!(matches!(msgs[1].role, Role::User));
}
2224
/// A tool call immediately followed by its matching result is a
/// valid pair; `sanitize_messages` must leave it alone.
#[test]
fn test_sanitize_preserves_valid_pairs() {
    use crate::tool::{ToolCall, ToolResult};

    // Assistant message issuing the single call `c1`.
    let assistant_call = Message {
        role: Role::Assistant,
        content: MessageContent::AssistantWithToolCalls {
            text: None,
            tool_calls: vec![ToolCall {
                id: "c1".into(),
                name: "bash".into(),
                arguments: "{}".into(),
            }],
            reasoning_content: None,
            thinking_blocks: Vec::new(),
        },
    };
    // Its matching tool result.
    let paired_result = Message {
        role: Role::Tool,
        content: MessageContent::ToolResult(ToolResult {
            call_id: "c1".into(),
            output: "ok".into(),
            success: true,
        }),
    };

    let mut msgs = vec![
        Message::new(Role::System, "sys"),
        Message::new(Role::User, "do it"),
        assistant_call,
        paired_result,
    ];

    sanitize_messages(&mut msgs);

    // Nothing removed: all four messages survive.
    assert_eq!(msgs.len(), 4);
}
2257
/// Regression for the DeepSeek 400 `insufficient tool messages
/// following tool_calls message`. An assistant message issued three
/// tool calls but only two results arrived before the next user text,
/// so the third call_id never gets a tool message and strict
/// providers reject the request. `sanitize_messages` must drop the
/// offending assistant-with-tool-calls (ATC) message together with
/// its partial results, so what survives satisfies the pairing rule.
#[test]
fn test_sanitize_drops_under_paired_atc_in_middle_of_history() {
    use crate::tool::{ToolCall, ToolResult};

    /// Assistant message issuing one `bash` call per id.
    fn atc(ids: &[&str]) -> Message {
        let tool_calls = ids
            .iter()
            .map(|id| ToolCall {
                id: id.to_string(),
                name: "bash".to_string(),
                arguments: "{}".to_string(),
            })
            .collect();
        Message {
            role: Role::Assistant,
            content: MessageContent::AssistantWithToolCalls {
                text: None,
                tool_calls,
                reasoning_content: None,
                thinking_blocks: Vec::new(),
            },
        }
    }

    /// Successful tool result for `id` carrying `output`.
    fn tool_ok(id: &str, output: &str) -> Message {
        Message {
            role: Role::Tool,
            content: MessageContent::ToolResult(ToolResult {
                call_id: id.to_string(),
                output: output.to_string(),
                success: true,
            }),
        }
    }

    let mut msgs = vec![
        Message::new(Role::System, "sys"),
        Message::new(Role::User, "first"),
        atc(&["c1", "c2", "c3"]),
        tool_ok("c1", "ok1"),
        tool_ok("c2", "ok2"),
        // No result for c3 — this gap is what triggers the 400.
        Message::new(Role::User, "second"),
    ];

    sanitize_messages(&mut msgs);

    // The ATC and both partial results are gone; sys + two user
    // messages remain.
    assert_eq!(msgs.len(), 3, "got: {:?}", msgs);
    assert!(matches!(msgs[0].role, Role::System));
    assert_eq!(msgs[1].text(), Some("first"));
    assert_eq!(msgs[2].text(), Some("second"));
}
2321
/// An under-paired assistant-with-tool-calls (ATC) message whose
/// boundary is the *next* ATC rather than a text message. The first
/// ATC (two calls, one result) and its partial result must be
/// dropped; the second, fully paired ATC must stay.
#[test]
fn test_sanitize_drops_under_paired_atc_when_followed_by_another_atc() {
    use crate::tool::{ToolCall, ToolResult};

    /// Assistant message issuing one `bash` call per id.
    fn atc(ids: &[&str]) -> Message {
        let tool_calls = ids
            .iter()
            .map(|id| ToolCall {
                id: id.to_string(),
                name: "bash".to_string(),
                arguments: "{}".to_string(),
            })
            .collect();
        Message {
            role: Role::Assistant,
            content: MessageContent::AssistantWithToolCalls {
                text: None,
                tool_calls,
                reasoning_content: None,
                thinking_blocks: Vec::new(),
            },
        }
    }

    /// Successful tool result for `id` with output "ok".
    fn tool_ok(id: &str) -> Message {
        Message {
            role: Role::Tool,
            content: MessageContent::ToolResult(ToolResult {
                call_id: id.to_string(),
                output: "ok".to_string(),
                success: true,
            }),
        }
    }

    let mut msgs = vec![
        Message::new(Role::User, "go"),
        atc(&["a1", "a2"]),
        tool_ok("a1"),
        // No result for a2.
        atc(&["b1"]),
        tool_ok("b1"),
    ];

    sanitize_messages(&mut msgs);

    // Remaining: user, second ATC, b1 result.
    assert_eq!(msgs.len(), 3, "got: {:?}", msgs);
    assert_eq!(msgs[0].text(), Some("go"));
    let second_is_atc = matches!(
        msgs[1].content,
        MessageContent::AssistantWithToolCalls { .. }
    );
    assert!(second_is_atc);
    let third_is_result = matches!(msgs[2].content, MessageContent::ToolResult(_));
    assert!(third_is_result);
}
2391
/// Under-paired assistant-with-tool-calls (ATC) message at the very
/// end of history, with no text or later ATC after it. Per the
/// original note this case was already handled before the
/// mid-history logic existed; it is pinned here so that logic cannot
/// regress the tail path.
#[test]
fn test_sanitize_drops_under_paired_atc_at_tail() {
    use crate::tool::{ToolCall, ToolResult};

    // Assistant message issuing two bash calls, c1 and c2.
    let two_calls: Vec<ToolCall> = ["c1", "c2"]
        .iter()
        .map(|id| ToolCall {
            id: id.to_string(),
            name: "bash".to_string(),
            arguments: "{}".to_string(),
        })
        .collect();
    let under_paired = Message {
        role: Role::Assistant,
        content: MessageContent::AssistantWithToolCalls {
            text: None,
            tool_calls: two_calls,
            reasoning_content: None,
            thinking_blocks: Vec::new(),
        },
    };
    // Only c1 gets a result; the conversation ends before c2's.
    let only_result = Message {
        role: Role::Tool,
        content: MessageContent::ToolResult(ToolResult {
            call_id: "c1".to_string(),
            output: "ok".to_string(),
            success: true,
        }),
    };

    let mut msgs = vec![Message::new(Role::User, "go"), under_paired, only_result];

    sanitize_messages(&mut msgs);

    // The ATC and its lone result are removed, leaving the user message.
    assert_eq!(msgs.len(), 1);
    assert_eq!(msgs[0].text(), Some("go"));
}
2436
/// Negative control for the mid-history pairing logic: when every
/// tool call has its result, `sanitize_messages` must remove nothing,
/// even though text messages follow the tool round. Guards against a
/// fix that throws away valid history.
#[test]
fn test_sanitize_preserves_fully_paired_history_through_text_boundaries() {
    use crate::tool::{ToolCall, ToolResult};

    /// Successful tool result for `id` carrying `output`.
    fn tool_ok(id: &str, output: &str) -> Message {
        Message {
            role: Role::Tool,
            content: MessageContent::ToolResult(ToolResult {
                call_id: id.to_string(),
                output: output.to_string(),
                success: true,
            }),
        }
    }

    // Assistant message issuing c1 and c2; both get results below.
    let paired_atc = Message {
        role: Role::Assistant,
        content: MessageContent::AssistantWithToolCalls {
            text: None,
            tool_calls: ["c1", "c2"]
                .iter()
                .map(|id| ToolCall {
                    id: id.to_string(),
                    name: "bash".to_string(),
                    arguments: "{}".to_string(),
                })
                .collect(),
            reasoning_content: None,
            thinking_blocks: Vec::new(),
        },
    };

    let mut msgs = vec![
        Message::new(Role::User, "first"),
        paired_atc,
        tool_ok("c1", "ok1"),
        tool_ok("c2", "ok2"),
        Message::new(Role::Assistant, "done"),
        Message::new(Role::User, "second"),
    ];

    let len_before = msgs.len();
    sanitize_messages(&mut msgs);

    assert_eq!(msgs.len(), len_before, "must not drop fully-paired history");
}
2489
/// End-to-end regression for the DeepSeek 400 `insufficient tool
/// messages following tool_calls message`, driven through
/// `build_messages` itself. The `sanitize_messages` unit tests cover
/// the function; this one pins the wiring, i.e. that the pairing
/// cleanup is applied on the main turn-tracked path as well.
///
/// Scenario: an assistant message with three tool calls, only two
/// results, then a fresh user turn. Every assistant-with-tool-calls
/// (ATC) message in the output must be followed by exactly as many
/// tool messages as it has calls.
#[test]
fn build_messages_satisfies_atc_pairing_after_under_paired_mid_history() {
    use crate::tool::{ToolCall, ToolResult};

    let bash_call = |id: &str| ToolCall {
        id: id.to_string(),
        name: "bash".to_string(),
        arguments: "{}".to_string(),
    };

    let mut conv = Conversation::new();
    conv.add_user_message("first task");
    conv.add_assistant_tool_calls(
        None,
        vec![bash_call("c1"), bash_call("c2"), bash_call("c3")],
        None,
    );
    // Results arrive for c1 and c2 only; c3's never lands.
    for (id, out) in vec![("c1", "ok1"), ("c2", "ok2")] {
        conv.add_tool_result(ToolResult {
            call_id: id.to_string(),
            output: out.to_string(),
            success: true,
        });
    }
    conv.add_user_message("second task");

    let (msgs, _stats) = build_messages(&conv, "sys", 8000, "");

    // Wire invariant: each ATC is immediately followed by one
    // tool-role message per call (the rule OpenAI / DeepSeek /
    // Claude / Gemini all require).
    let mut cursor = 0;
    while let Some(msg) = msgs.get(cursor) {
        let expected = match &msg.content {
            MessageContent::AssistantWithToolCalls { tool_calls, .. } => tool_calls.len(),
            _ => {
                cursor += 1;
                continue;
            }
        };
        for slot in (cursor + 1)..=(cursor + expected) {
            assert!(
                slot < msgs.len(),
                "ATC at {} expects {} tool_results but messages end at {}: {:?}",
                cursor,
                expected,
                msgs.len(),
                msgs.iter().map(|m| &m.role).collect::<Vec<_>>()
            );
            assert!(
                matches!(
                    msgs[slot].content,
                    MessageContent::ToolResult(_) | MessageContent::ToolResultRef(_)
                ),
                "ATC at {} expects tool_result at {} but found {:?}",
                cursor,
                slot,
                msgs[slot].role
            );
        }
        // Jump past the ATC and the results that were just checked.
        cursor += 1 + expected;
    }

    // Defensive: none of the dropped ATC's call ids may survive,
    // either as a tool call or as a plain tool result.
    for m in &msgs {
        match &m.content {
            MessageContent::AssistantWithToolCalls { tool_calls, .. } => {
                for tc in tool_calls {
                    assert_ne!(tc.id, "c3", "dropped ATC's call_ids must not survive");
                    assert_ne!(tc.id, "c1");
                    assert_ne!(tc.id, "c2");
                }
            }
            MessageContent::ToolResult(r) => {
                assert_ne!(r.call_id, "c1", "partial tool_results must not survive");
                assert_ne!(r.call_id, "c2");
            }
            _ => {}
        }
    }
}
2580
/// Regression: the `microcompact` gate follows the threshold it is
/// given.
///
/// Per the original report, the gate used to be a hardcoded
/// `total_chars < 100_000`, so any context with a real budget under
/// ~25K tokens (Ollama at 8K) could never reach it and per-model
/// `tool_output_cap` tuning was silently neutralized. The threshold
/// is now an argument, and this test runs the same ~25K-char payload
/// against a high and a low value.
#[test]
fn microcompact_respects_threshold_parameter() {
    use crate::tool::{ToolCall, ToolResult};

    /// System prompt plus 25 turns of user / bash call / 1000-char
    /// result, i.e. about 25_000 bytes of tool output.
    fn build_msgs() -> Vec<Message> {
        let mut msgs = Vec::with_capacity(1 + 25 * 3);
        msgs.push(Message::new(Role::System, "sys"));
        for i in 0..25 {
            let call_id = format!("c{}", i);
            let user = Message::new(Role::User, format!("task {}", i));
            let assistant = Message {
                role: Role::Assistant,
                content: MessageContent::AssistantWithToolCalls {
                    text: None,
                    tool_calls: vec![ToolCall {
                        id: call_id.clone(),
                        name: "bash".into(),
                        arguments: "{}".into(),
                    }],
                    reasoning_content: None,
                    thinking_blocks: Vec::new(),
                },
            };
            let result = Message {
                role: Role::Tool,
                content: MessageContent::ToolResult(ToolResult {
                    call_id,
                    output: "x".repeat(1000),
                    success: true,
                }),
            };
            msgs.extend(vec![user, assistant, result]);
        }
        msgs
    }

    /// Sum of output lengths over all plain ToolResult messages.
    fn total_tool_bytes(msgs: &[Message]) -> usize {
        msgs.iter()
            .filter_map(|m| match &m.content {
                MessageContent::ToolResult(r) => Some(r.output.len()),
                _ => None,
            })
            .sum()
    }

    // High threshold (100K): 25K of payload is below it, so the call
    // must be a no-op.
    let mut msgs_high = build_msgs();
    let len_high = msgs_high.len();
    let bytes_high = total_tool_bytes(&msgs_high);
    microcompact(&mut msgs_high, len_high, 100_000);
    assert_eq!(
        msgs_high.len(),
        len_high,
        "high-threshold run must not drop msgs"
    );
    assert_eq!(
        total_tool_bytes(&msgs_high),
        bytes_high,
        "high threshold (25K < 100K) must leave tool_result bytes untouched"
    );

    // Low threshold (10K): the same payload exceeds it, so tool
    // results must shrink.
    let mut msgs_low = build_msgs();
    let len_low = msgs_low.len();
    let before_low_bytes = total_tool_bytes(&msgs_low);
    microcompact(&mut msgs_low, len_low, 10_000);
    let after_low_bytes = total_tool_bytes(&msgs_low);
    assert!(
        after_low_bytes < before_low_bytes,
        "low threshold (25K > 10K) must shrink tool_result bytes, before={} after={}",
        before_low_bytes,
        after_low_bytes
    );
}
2664
2665 /// Regression: `build_compression_content` must not cut between an
2666 /// `AssistantWithToolCalls` and its trailing `ToolResult`(s). Cutting
2667 /// mid-pair leaves orphan tool_results which `clean_message_pipeline`
2668 /// silently drops — the model loses edit confirmations. Anthropic API
2669 /// also rejects orphan tool_results.
2670 ///
2671 /// Construct a conversation where the naive cut index
2672 /// (`len - KEEP_MESSAGES`) lands on a ToolResult whose paired ATC
2673 /// sits in the drop range. Verify the returned cut index skips past
2674 /// ALL trailing ToolResults so no orphan survives.
2675 #[test]
2676 fn compression_cut_never_splits_tool_use_result_pair() {
2677 use crate::tool::{ToolCall, ToolResult};
2678
2679 // Helper: build a conv where messages[cut_idx] = ToolResult
2680 // with its ATC at messages[cut_idx - 1] (in drop range).
2681 let build_conv = || {
2682 let mut conv = Conversation::new();
2683
2684 // Pad with plain text turns until we reach the position where
2685 // the problematic tool pair will land.
2686 // KEEP_MESSAGES = 20. We want naive_cut = len - 20 to hit a
2687 // ToolResult. If we put ATC at msg[N-21] and ToolResult at
2688 // msg[N-20], then `conv.len() = N`, `naive_cut = N-20` →
2689 // lands on the ToolResult. ✓
2690 //
2691 // Put a text-only prefix of 20 messages, then ATC+ToolResult,
2692 // then another 20 text-only suffix → len = 42, naive_cut = 22
2693 // which SHOULD be the ToolResult we planted.
2694
2695 for i in 0..10 {
2696 conv.add_user_message(&format!("prefix task {}", i));
2697 conv.push_delta(&format!("prefix reply {}", i));
2698 conv.finalize_stream();
2699 }
2700 // After 10 text turns: 20 messages.
2701
2702 // Position 20 would be the next user msg. But we want ATC here
2703 // (msg[20]) and ToolResult at msg[21]. Problem: ATC must be
2704 // preceded by a User in a normal turn. Use a real tool round.
2705 conv.add_user_message("trigger tool"); // msg[20]
2706 conv.add_assistant_tool_calls(
2707 Some("r"),
2708 vec![ToolCall {
2709 // msg[21]
2710 id: "call_would_orphan".to_string(),
2711 name: "bash".to_string(),
2712 arguments: "{}".to_string(),
2713 }],
2714 None,
2715 );
2716 conv.add_tool_result(ToolResult {
2717 // msg[22]
2718 call_id: "call_would_orphan".to_string(),
2719 output: "tool output that must not be lost".to_string(),
2720 success: true,
2721 });
2722 // After the tool round: 23 messages.
2723
2724 // Suffix: pad with text turns so len - KEEP_MESSAGES = 22.
2725 // Need len = 42. Currently 23. Add 19 more → 42.
2726 // Adding in user/assistant pairs: 19/2 = 9 full + 1 extra.
2727 for i in 0..9 {
2728 conv.add_user_message(&format!("suffix task {}", i));
2729 conv.push_delta(&format!("suffix reply {}", i));
2730 conv.finalize_stream();
2731 }
2732 // 23 + 18 = 41. Add one more user message.
2733 conv.add_user_message("final task");
2734 conv
2735 };
2736
2737 let conv = build_conv();
2738 let len = conv.messages.len();
2739 assert_eq!(len, 42, "conv layout wrong");
2740
2741 let naive_cut = len - KEEP_MESSAGES;
2742 assert_eq!(naive_cut, 22);
2743 // Confirm msg[22] is indeed the ToolResult we planted.
2744 assert!(
2745 matches!(conv.messages[22].content, MessageContent::ToolResult(_)),
2746 "test layout broken: msg[22] should be ToolResult"
2747 );
2748
2749 // Now query the real fn. Fix guarantees the cut index points at
2750 // a position that is NOT a ToolResult (advanced past trailing
2751 // ToolResults so no orphan survives).
2752 let (_summary, actual_cut) = build_compression_content(&conv);
2753
2754 if actual_cut < conv.messages.len() {
2755 let first_survivor = &conv.messages[actual_cut];
2756 let is_tool_result = matches!(
2757 first_survivor.content,
2758 MessageContent::ToolResult(_) | MessageContent::ToolResultRef(_)
2759 );
2760 assert!(
2761 !is_tool_result,
2762 "cut index {} lands on ToolResult (naive was {}); \
2763 surviving range would start with orphan",
2764 actual_cut, naive_cut
2765 );
2766 }
2767
2768 // Applied-cut invariant: after draining [..actual_cut], every
2769 // surviving ToolResult has its paired ATC in the surviving range.
2770 let mut c2 = build_conv();
2771 c2.apply_compression(actual_cut, "summary".to_string());
2772
2773 let mut live_call_ids = std::collections::HashSet::<String>::new();
2774 for msg in &c2.messages {
2775 match &msg.content {
2776 MessageContent::AssistantWithToolCalls { tool_calls, .. } => {
2777 for tc in tool_calls {
2778 live_call_ids.insert(tc.id.clone());
2779 }
2780 }
2781 MessageContent::ToolResult(r) => {
2782 assert!(
2783 live_call_ids.contains(&r.call_id),
2784 "orphan ToolResult({}) in surviving range — its ATC was dropped",
2785 r.call_id
2786 );
2787 }
2788 _ => {}
2789 }
2790 }
2791 }
2792
/// Compaction only counts as a success when the next wire payload
/// actually shrinks. Dropping history messages is not enough: the
/// summary that replaces them (plus any post-compress state note) may
/// weigh more than what was removed. This test builds such a case and
/// shows the estimated `build_messages` token total going *up* after
/// compression, so callers must compare before/after wire tokens rather
/// than raw history length.
#[test]
fn compression_must_be_judged_by_wire_token_savings() {
    // Estimated token total of the payload `build_messages` renders for
    // a conversation, using the same fixed arguments for every sample.
    let wire_tokens = |conversation: &Conversation| -> usize {
        build_messages(conversation, "sys", 64_000, "")
            .0
            .iter()
            .map(|m| m.estimate_tokens())
            .sum()
    };

    // Sixteen short user/assistant turns: little text to remove.
    let mut conv = Conversation::new();
    for turn in 0..16 {
        conv.add_user_message(&format!("task {}", turn));
        conv.push_delta("ok");
        conv.finalize_stream();
    }

    let before_tokens = wire_tokens(&conv);

    let (_mechanical_summary, remove_count) = build_compression_content(&conv);
    assert!(remove_count > 0, "test conversation should be compressible");

    // Swap the dropped prefix for a deliberately oversized summary, then
    // append a state note like the one a caller adds after compression.
    let oversized_summary = "expanded summary ".repeat(2_000);
    conv.apply_compression(remove_count, oversized_summary);
    conv.add_user_message(
        "[Context was compressed. Here is your current state:]\n\
         TASK: continue the current issue analysis\n\
         RECENTLY READ: crates/atomcode-core/src/agent/mod.rs",
    );

    let after_tokens = wire_tokens(&conv);

    // Messages were removed, yet the wire payload grew.
    assert!(
        after_tokens > before_tokens,
        "dropped messages alone is not a valid compaction success metric: \
         before={before_tokens}, after={after_tokens}"
    );
}
2834
2835}