// harness/compaction.rs

//! Context-window-aware compaction.
//!
//! Long-running native sessions accumulate a `messages` history that
//! eventually exceeds the model's context window. Without intervention
//! the next turn either truncates server-side (silently losing context)
//! or fails with a 400 / `context_length_exceeded`. Compaction folds the
//! middle of the conversation into a `<conversation-summary>` checkpoint
//! while preserving all true user messages verbatim within a token budget —
//! same idea as Codex local compaction.
//!
//! Strategy contract (`CompactionStrategy`):
//!   * `should_compact` — pure boolean gate; the agent loop checks this
//!     each step BEFORE building the next model request.
//!   * `compact` — best-effort; returns a fresh `Vec<ChatMessage>` that
//!     replaces the running history. Caller emits a separate event so
//!     `native_adapter` can mark the boundary on the wire.
//!
//! Producer of compaction:
//!   * `agent_loop::run_loop` constructs `SummarizeCompactionStrategy` per
//!     turn (cheap — no internal state) and calls `should_compact` /
//!     `compact` between steps.
//!
//! The strategy reuses the session's primary `ModelClient` to produce
//! the summary; that keeps cache keys hot on the prefix shared with the
//! main agent call (Anthropic prompt cache hits straight through).

use async_trait::async_trait;
use std::sync::Arc;

use crate::model::{
    collect_model_response, ChatMessage, ModelClient, ModelClientError, ModelResponse,
    ModelTurnInput,
};
use crate::tools::ToolSpec;

/// Fraction of the model's context window above which compaction fires.
/// 0.90 leaves a ~10 % headroom for the in-flight turn's tool results and
/// the model's reply, matching Codex's default threshold.
pub const DEFAULT_TRIGGER_FRACTION: f64 = 0.90;

/// Minimum number of messages the history must contain before compaction
/// is allowed to run. A history with this many messages or fewer is too
/// short to benefit from compaction and the call is a no-op.
pub const DEFAULT_TAIL_MIN_MESSAGES: usize = 4;

/// Soft cap on summary length when serialised back into a `User`
/// `<conversation-summary>` block. 2 000 ≈ 8 KB — enough for most
/// multi-turn dialogues; OMA uses the same default.
///
/// NOTE(review): the value is stored on `SummarizeCompactionStrategy`, but
/// the `compact` implementation in this file builds its request without any
/// max-token field, so the cap is not currently forwarded to the model —
/// confirm whether the hint is meant to be wired through.
pub const DEFAULT_SUMMARY_MAX_TOKENS: i32 = 2_000;

/// Token budget for verbatim user-message retention in the replacement
/// history. All true user messages are collected from the full history
/// (oldest to newest) and as many as fit within this budget are kept
/// verbatim — newest first. Matches Codex's `COMPACT_USER_MESSAGE_MAX_TOKENS`.
pub const DEFAULT_USER_MESSAGE_TOKEN_BUDGET: u64 = 20_000;

/// Mixed-script token estimator.
///
/// ASCII runs compress at ~4 chars/token, while CJK and other non-ASCII
/// text tokenizes at ≈1 token per char on every modern BPE vocabulary. The
/// previous flat `bytes / 4` heuristic underestimated Chinese text ~3×
/// (one CJK char = 3 UTF-8 bytes → counted as 0.75 tokens instead of ~1),
/// so compaction fired far too late on Chinese-heavy conversations.
///
/// Returns `ceil(ascii_chars / 4) + non_ascii_chars`. Input that is empty
/// or whitespace-only costs 0. Still an estimator — not for billing.
pub fn estimate_tokens(s: &str) -> u64 {
    if s.trim().is_empty() {
        return 0;
    }
    // Single pass over the chars, tallying both classes at once.
    let (ascii, non_ascii) = s.chars().fold((0u64, 0u64), |(a, n), c| {
        if c.is_ascii() {
            (a + 1, n)
        } else {
            (a, n + 1)
        }
    });
    ascii.div_ceil(4) + non_ascii
}

81/// Per-message estimate: `estimate_tokens` over every text part, plus
82/// small fixed overheads for structural wrapping (tool-call envelope ≈ 8
83/// tokens, tool-result envelope ≈ 16 — same budgets as the old byte-based
84/// +32 / +64 at 4 bytes/token). Floors at 1 so empty messages still cost.
85pub fn estimate_chat_message_tokens(m: &ChatMessage) -> u64 {
86    let tokens = match m {
87        ChatMessage::User { content, .. } => estimate_tokens(content),
88        ChatMessage::Assistant {
89            text,
90            tool_calls,
91            thinking,
92        } => {
93            let text_tokens = text.as_deref().map(estimate_tokens).unwrap_or(0);
94            let tc_tokens: u64 = tool_calls
95                .iter()
96                .map(|tc| estimate_tokens(&tc.input.to_string()) + estimate_tokens(&tc.name) + 8)
97                .sum();
98            let thinking_tokens = thinking
99                .as_ref()
100                .map(|t| {
101                    estimate_tokens(&t.text)
102                        + t.signature.as_deref().map(estimate_tokens).unwrap_or(0)
103                })
104                .unwrap_or(0);
105            text_tokens + tc_tokens + thinking_tokens
106        }
107        ChatMessage::Tool { content, .. } => estimate_tokens(content) + 16,
108    };
109    tokens.max(1)
110}
111
112/// Sum the per-message estimates. Stable across providers because we
113/// only look at the rendered text / JSON size, not the wire shape.
114pub fn estimate_messages_tokens(messages: &[ChatMessage]) -> u64 {
115    messages.iter().map(estimate_chat_message_tokens).sum()
116}
117
/// Per-model context window in tokens. Anthropic / OpenAI don't expose
/// this through their wire APIs, so we keep a hand-encoded table and
/// fall back to a conservative 128 000 (the smallest current "modern"
/// model window — GPT-4o / Claude Haiku 3.5).
///
/// Matching is case-insensitive (ASCII) and first-match-wins, so the more
/// specific Claude 1M entries must stay ahead of the generic `claude` rule.
///
/// Adding a new model: extend the table. Unknown model strings fall to
/// the default and a warning would be appropriate — but compaction is
/// best-effort so we just default rather than refuse to run.
pub fn resolve_context_window_tokens(model: &str) -> u64 {
    /// True when `haystack` contains at least one of `needles`.
    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(*needle))
    }

    let m = model.to_ascii_lowercase();

    // Claude 4.6 / 4.7 1M context window (Sonnet / Opus extended).
    if contains_any(&m, &["opus-4-7", "opus-4-6", "sonnet-4-6"]) {
        return 1_000_000;
    }
    // Anthropic Claude 3.x / 4.x: 200K
    if m.contains("claude") {
        return 200_000;
    }
    // OpenAI GPT-4 family: 128K. The single `gpt-4` substring already covers
    // `gpt-4o` and `gpt-4.1`.
    // NOTE(review): GPT-4.1 is advertised with a larger window; 128K is the
    // conservative value this table has always returned — confirm before raising.
    if m.contains("gpt-4") {
        return 128_000;
    }
    // OpenAI o1 / o3 / o4 reasoning: 200K. Prefix match only, so a
    // provider-prefixed id (e.g. `openai/o3`) falls through to the default.
    if ["o1", "o3", "o4"].iter().any(|prefix| m.starts_with(*prefix)) {
        return 200_000;
    }
    // MiniMax / DeepSeek: treated as 1M.
    // NOTE(review): verify the DeepSeek figure against the provider's docs; an
    // over-estimate here delays compaction past the real window.
    if contains_any(&m, &["minimax", "deepseek"]) {
        return 1_000_000;
    }
    // Conservative default.
    128_000
}

152/// Context passed to `compact()`. Owns the model client + tools the
153/// summary call will use (mirrors the main agent's call so the prefix
154/// stays cache-hot).
155pub struct CompactionContext {
156    pub system_prompt: Option<String>,
157    pub model_client: Arc<dyn ModelClient>,
158    pub context_window_tokens: u64,
159    pub tools: Vec<ToolSpec>,
160}
161
162/// Result of a single `CompactionStrategy::compact` call. `messages` is
163/// the folded history caller installs; `usage` is the token spend for
164/// the summarize round trip (provider-reported via the same path as
165/// main turn calls). `usage` is `None` when the provider elides usage
166/// or when the strategy short-circuited without calling the model.
167#[derive(Debug, Clone, PartialEq)]
168pub struct CompactionOutcome {
169    pub messages: Vec<ChatMessage>,
170    pub usage: Option<crate::event::HarnessUsage>,
171}
172
173#[derive(Debug, thiserror::Error)]
174pub enum CompactionError {
175    #[error("compaction model call failed: {0}")]
176    ModelCall(#[from] ModelClientError),
177    /// The model returned an empty summary. We refuse to fold history
178    /// in that case — losing N turns of conversation for zero gain is
179    /// strictly worse than the original "ran out of context" failure.
180    /// Caller treats this as a no-op and lets the next turn try again.
181    #[error("model produced empty summary; refusing to fold history")]
182    EmptySummary,
183}
184
185#[async_trait]
186pub trait CompactionStrategy: Send + Sync {
187    /// Boolean gate. Pure (no side effects). Cheap to call every step.
188    fn should_compact(&self, messages: &[ChatMessage], context_window_tokens: u64) -> bool;
189
190    /// Fold history. Caller hands over the full `messages` list and
191    /// expects a shorter list back (typically `[summary, ...tail]`).
192    /// Failures bubble up; agent_loop treats them as "skip this turn's
193    /// compaction" rather than failing the whole turn.
194    async fn compact(
195        &self,
196        messages: Vec<ChatMessage>,
197        ctx: &CompactionContext,
198    ) -> Result<CompactionOutcome, CompactionError>;
199}
200
/// Codex-style local compaction: send the full history to the model with a
/// handoff-summary prompt appended, collect the reply as a checkpoint, then
/// rebuild history as `[...retained user messages, summary]`.
///
/// Design rationale:
///   * User messages are preserved verbatim (up to `user_message_token_budget`)
///     because they carry precise constraints and goals that paraphrasing loses.
///   * Assistant / tool messages are folded into the summary — they are large
///     but low-density and tolerate lossy compression.
///   * The summary is placed LAST so the model reads it as the most recent
///     context rather than as background preamble.
///   * Reusing the same model client keeps the Anthropic prompt-cache prefix
///     hot (identical system + tools on every call).
pub struct SummarizeCompactionStrategy {
    /// Fraction of the context window above which `should_compact` fires.
    pub trigger_fraction: f64,
    /// Minimum total message count below which compaction is skipped.
    /// Does not control retention; use `user_message_token_budget` for that.
    pub tail_min_messages: usize,
    /// Intended soft cap on the summary length, in tokens.
    /// NOTE(review): not read by the `compact` implementation in this file.
    pub summary_max_tokens: i32,
    /// Text of the final `User` message appended to the summarize request.
    pub summary_prompt: String,
    /// Token budget for verbatim user-message retention in the replacement history.
    pub user_message_token_budget: u64,
}

225impl Default for SummarizeCompactionStrategy {
226    fn default() -> Self {
227        Self {
228            trigger_fraction: DEFAULT_TRIGGER_FRACTION,
229            tail_min_messages: DEFAULT_TAIL_MIN_MESSAGES,
230            summary_max_tokens: DEFAULT_SUMMARY_MAX_TOKENS,
231            summary_prompt: DEFAULT_SUMMARY_PROMPT.into(),
232            user_message_token_budget: DEFAULT_USER_MESSAGE_TOKEN_BUDGET,
233        }
234    }
235}
236
237impl SummarizeCompactionStrategy {
238    pub fn with_trigger_fraction(mut self, fraction: f64) -> Self {
239        self.trigger_fraction = fraction;
240        self
241    }
242
243    pub fn with_tail_min_messages(mut self, n: usize) -> Self {
244        self.tail_min_messages = n;
245        self
246    }
247
248    pub fn with_summary_max_tokens(mut self, n: i32) -> Self {
249        self.summary_max_tokens = n;
250        self
251    }
252
253    pub fn with_user_message_token_budget(mut self, budget: u64) -> Self {
254        self.user_message_token_budget = budget;
255        self
256    }
257}
258
/// Handoff-oriented summarise prompt. Instructs the model to produce a
/// structured checkpoint for *another agent instance* to resume from —
/// not a human-readable recap. Mirrors Codex's `SUMMARIZATION_PROMPT`.
///
/// The trailing `\` on each source line is a string continuation: the
/// newline and the next line's leading whitespace are not part of the
/// value, so only the explicit `\n` escapes produce line breaks.
pub const DEFAULT_SUMMARY_PROMPT: &str = "You are performing a CONTEXT CHECKPOINT COMPACTION. \
    Create a handoff summary for another agent instance that will resume this task.\n\n\
    Include:\n\
    - Current progress and key decisions made\n\
    - Important context, constraints, or user preferences that must be respected\n\
    - What remains to be done (clear next steps)\n\
    - Any critical data, file paths, command outputs, or references needed to continue\n\n\
    If a prior <conversation-summary> block exists in this conversation, produce an UPDATED \
    summary that supersedes it (incorporating all activity since). \
    Output only the summary text — no preamble, no closing remarks.";

273#[async_trait]
274impl CompactionStrategy for SummarizeCompactionStrategy {
275    fn should_compact(&self, messages: &[ChatMessage], context_window_tokens: u64) -> bool {
276        // Too few messages → never compact; the summarize call would
277        // cost more than the prefix it's saving.
278        if messages.len() <= self.tail_min_messages {
279            return false;
280        }
281        let tokens = estimate_messages_tokens(messages);
282        let threshold = ((context_window_tokens as f64) * self.trigger_fraction).round() as u64;
283        tokens > threshold
284    }
285
286    async fn compact(
287        &self,
288        messages: Vec<ChatMessage>,
289        ctx: &CompactionContext,
290    ) -> Result<CompactionOutcome, CompactionError> {
291        // Hard floor — refuse to compact if we'd end up with fewer than
292        // the tail count. Keeps the strategy idempotent in degenerate
293        // cases. `usage: None` because no model call ran.
294        if messages.len() <= self.tail_min_messages {
295            return Ok(CompactionOutcome {
296                messages,
297                usage: None,
298            });
299        }
300
301        // Build the summarize request. Same system + same tools as the
302        // main agent would use, then append one User message asking
303        // for the summary. We DON'T set tools: vec![] — keeping them
304        // makes the prefix bytes match what the main call sent, which
305        // is what Anthropic's cache compares.
306        let mut summarize_messages = messages.clone();
307        summarize_messages.push(ChatMessage::User {
308            content: self.summary_prompt.clone(),
309            attachments: vec![],
310        });
311        let request = ModelTurnInput {
312            system_prompt: ctx.system_prompt.clone(),
313            messages: summarize_messages,
314            tools: ctx.tools.clone(),
315            tool_choice: crate::model::ToolChoice::Auto,
316            parallel_tool_calls: None,
317        };
318
319        // Drain the stream into a single response — compaction doesn't
320        // care about token-level emit; it just needs the text. The
321        // model client trait's default `next` does this for us, but we
322        // go through stream + collect explicitly so future Anthropic-
323        // path strategies can elide tool calls / thinking blocks if
324        // they want to.
325        let stream = ctx.model_client.stream(request).await?;
326        let response = collect_model_response(stream).await?;
327        let (summary_text, usage) = match response {
328            ModelResponse::Message { text, usage, .. } => (text, usage),
329            // Model decided to call a tool instead of answering — give
330            // up on this round, history stays put.
331            ModelResponse::ToolCall { .. } => return Err(CompactionError::EmptySummary),
332        };
333        if summary_text.trim().is_empty() {
334            return Err(CompactionError::EmptySummary);
335        }
336
337        // Collect all true user messages from history, skipping prior summary
338        // messages (they are superseded by the new checkpoint we just generated).
339        let user_texts = collect_user_message_texts(&messages);
340        if user_texts.is_empty() {
341            // No real user messages to retain — skip installing the summary.
342            // Surface usage so HR can account for the (now-discarded) model call.
343            return Ok(CompactionOutcome { messages, usage });
344        }
345
346        // Build replacement history: retained user messages first, summary last.
347        // User messages are selected newest-first within the token budget then
348        // reversed to chronological order. Placing the summary last means the
349        // model reads the most recent context at the end of the prompt.
350        let out = build_compacted_history(
351            &user_texts,
352            &summary_text,
353            self.user_message_token_budget,
354        );
355        Ok(CompactionOutcome {
356            messages: out,
357            usage,
358        })
359    }
360}
361
/// Wraps `summary` in `<conversation-summary>` tags, one tag per line, with
/// the summary text unmodified in between. The opening tag at the start of
/// the content is what `is_summary_message` later recognises.
fn serialize_summary(summary: &str) -> String {
    format!("<conversation-summary>\n{summary}\n</conversation-summary>")
}

366/// Collect the text of every real `User` message in `messages`, in order,
367/// filtering out prior summary messages. Prior summaries are superseded by
368/// the new checkpoint and must not be recycled into the replacement history.
369fn collect_user_message_texts(messages: &[ChatMessage]) -> Vec<String> {
370    messages
371        .iter()
372        .filter_map(|m| match m {
373            ChatMessage::User { content, .. } if !is_summary_message(content) => {
374                Some(content.clone())
375            }
376            _ => None,
377        })
378        .collect()
379}
380
/// Returns `true` when `content`, ignoring leading whitespace, begins with
/// the `<conversation-summary>` opening tag. A tag that appears later in
/// the text does not count.
fn is_summary_message(content: &str) -> bool {
    content.trim_start().starts_with("<conversation-summary>")
}

385/// Build the replacement history: retained user messages (chronological order)
386/// followed by the summary as the final message.
387///
388/// `user_texts` is the full list of real user messages oldest→newest.
389/// Messages are selected newest-first within `token_budget`; if the oldest
390/// selected message only partially fits, it is truncated rather than dropped.
391fn build_compacted_history(
392    user_texts: &[String],
393    summary_text: &str,
394    token_budget: u64,
395) -> Vec<ChatMessage> {
396    let mut selected: Vec<String> = Vec::new();
397    let mut remaining = token_budget;
398    for text in user_texts.iter().rev() {
399        if remaining == 0 {
400            break;
401        }
402        let tokens = estimate_tokens(text);
403        if tokens <= remaining {
404            selected.push(text.clone());
405            remaining -= tokens;
406        } else {
407            // Partially fits: truncate rather than skip so the budget is not
408            // wasted and the oldest retained message still carries context.
409            selected.push(truncate_to_token_budget(text, remaining));
410            break;
411        }
412    }
413    selected.reverse(); // restore chronological order
414    let mut out = Vec::with_capacity(selected.len() + 1);
415    for text in selected {
416        out.push(ChatMessage::User {
417            content: text,
418            attachments: vec![],
419        });
420    }
421    // Summary goes last — the model reads this as the most recent context.
422    out.push(ChatMessage::User {
423        content: serialize_summary(summary_text),
424        attachments: vec![],
425    });
426    out
427}
428
/// Truncate `s` to at most `budget` estimated tokens using the same
/// mixed-script formula as `estimate_tokens` (`ceil(ascii / 4) + non_ascii`).
/// Returns the longest prefix of `s` whose running estimate stays within
/// `budget`; the cut always falls on a character boundary. A zero budget
/// yields an empty string.
fn truncate_to_token_budget(s: &str, budget: u64) -> String {
    if budget == 0 {
        return String::new();
    }
    let mut ascii: u64 = 0;
    let mut non_ascii: u64 = 0;
    // Byte offset just past the last character that still fits.
    let mut end = 0usize;
    for (start, c) in s.char_indices() {
        if c.is_ascii() {
            ascii += 1;
        } else {
            non_ascii += 1;
        }
        // The counters are not read again after the break, so tallying the
        // overflowing character before the check is harmless.
        if ascii.div_ceil(4) + non_ascii > budget {
            break;
        }
        end = start + c.len_utf8();
    }
    s[..end].to_string()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{ModelChunk, ModelClient};
    use crate::tools::ToolInvocation;
    use async_trait::async_trait;
    use futures::stream::{BoxStream, StreamExt};

    /// In-process model client that returns a fixed summary string.
    /// Test fixture only — production summarisation goes through the
    /// real model client.
    #[derive(Clone)]
    struct FixedSummaryClient {
        summary: String,
    }
    #[async_trait]
    impl ModelClient for FixedSummaryClient {
        async fn stream(
            &self,
            _input: ModelTurnInput,
        ) -> Result<BoxStream<'static, Result<ModelChunk, ModelClientError>>, ModelClientError>
        {
            let chunks = vec![
                Ok(ModelChunk::TextDelta {
                    msg_id: "sum".into(),
                    delta: self.summary.clone(),
                }),
                Ok(ModelChunk::Done {
                    stop_reason: "end_turn".into(),
                    usage: None,
                }),
            ];
            Ok(futures::stream::iter(chunks).boxed())
        }
    }

    fn user(s: &str) -> ChatMessage {
        ChatMessage::User {
            content: s.into(),
            attachments: vec![],
        }
    }

    fn assistant_text(s: &str) -> ChatMessage {
        ChatMessage::Assistant {
            text: Some(s.into()),
            tool_calls: vec![],
            thinking: None,
        }
    }

    fn tool_msg(id: &str, content: &str) -> ChatMessage {
        ChatMessage::Tool {
            tool_call_id: id.into(),
            content: content.into(),
            is_error: false,
            attachments: vec![],
        }
    }

    /// Extracts the `content` of each message, panicking on any non-`User` entry.
    fn user_contents(messages: &[ChatMessage]) -> Vec<&str> {
        messages
            .iter()
            .map(|m| match m {
                ChatMessage::User { content, .. } => content.as_str(),
                other => panic!("expected User message, got {other:?}"),
            })
            .collect()
    }

    #[test]
    fn token_estimate_grows_with_content_size() {
        let small = user("hi");
        let big = user(&"x".repeat(8000));
        assert!(estimate_chat_message_tokens(&big) > estimate_chat_message_tokens(&small));
    }

    #[test]
    fn estimate_tokens_splits_ascii_and_cjk() {
        // ASCII at 4 chars/token (ceil), non-ASCII at 1 token/char,
        // whitespace-only is free.
        assert_eq!(estimate_tokens(""), 0);
        assert_eq!(estimate_tokens("   \n"), 0);
        assert_eq!(estimate_tokens("abcd"), 1);
        assert_eq!(estimate_tokens("abcde"), 2); // ceil(5/4)
        assert_eq!(estimate_tokens("你好世界"), 4); // 4 CJK chars = 4 tokens
        assert_eq!(estimate_tokens("hi你好"), 3); // ceil(2/4)=1 + 2
    }

    #[test]
    fn token_estimate_counts_cjk_near_one_per_char() {
        // 1000 CJK chars ≈ 1000 tokens. The old bytes/4 heuristic said
        // ~750 (3 UTF-8 bytes / 4); the rune-aware estimator must not
        // undercount, or compaction triggers too late on Chinese text.
        let cjk = user(&"汉".repeat(1000));
        let estimate = estimate_chat_message_tokens(&cjk);
        assert!(
            estimate >= 1000,
            "CJK undercounted: got {estimate}, want >= 1000"
        );
    }

    #[test]
    fn token_estimate_includes_tool_call_input() {
        // Same text length, but assistant carrying a tool_call should
        // cost more (we count the JSON arguments).
        let bare = assistant_text("done");
        let with_tool = ChatMessage::Assistant {
            text: Some("done".into()),
            tool_calls: vec![ToolInvocation {
                id: "tc".into(),
                name: "bash".into(),
                input: serde_json::json!({"command": "echo lots of bytes here for sure"}),
            }],
            thinking: None,
        };
        assert!(estimate_chat_message_tokens(&with_tool) > estimate_chat_message_tokens(&bare));
    }

    #[test]
    fn context_window_table_known_models() {
        assert_eq!(resolve_context_window_tokens("claude-opus-4-7"), 1_000_000);
        assert_eq!(
            resolve_context_window_tokens("claude-sonnet-4-6"),
            1_000_000
        );
        assert_eq!(resolve_context_window_tokens("claude-haiku-4-5"), 200_000);
        assert_eq!(resolve_context_window_tokens("claude-3-5-sonnet"), 200_000);
        assert_eq!(resolve_context_window_tokens("gpt-4o"), 128_000);
        assert_eq!(resolve_context_window_tokens("gpt-4.1-mini"), 128_000);
        assert_eq!(resolve_context_window_tokens("o3-mini"), 200_000);
        assert_eq!(resolve_context_window_tokens("MiniMax-M2"), 1_000_000);
        // Unknown → conservative default.
        assert_eq!(resolve_context_window_tokens("unknown-model"), 128_000);
    }

    #[test]
    fn should_compact_skips_when_below_threshold() {
        let strat = SummarizeCompactionStrategy::default();
        let messages = vec![user("hello"), assistant_text("hi")];
        // 200K window, tiny conversation — never fires.
        assert!(!strat.should_compact(&messages, 200_000));
    }

    #[test]
    fn should_compact_fires_when_above_threshold() {
        let strat = SummarizeCompactionStrategy::default();
        // 5 messages * 8000 ASCII chars each ≈ 10K tokens.
        // With an 11K window the 90% threshold is 9 900 tokens, so
        // 10K tokens exceeds it and compaction must fire.
        let messages = vec![
            user(&"x".repeat(8000)),
            assistant_text(&"y".repeat(8000)),
            user(&"x".repeat(8000)),
            assistant_text(&"y".repeat(8000)),
            user(&"x".repeat(8000)),
        ];
        assert!(strat.should_compact(&messages, 11_000));
    }

    #[test]
    fn should_compact_respects_tail_min_floor() {
        let strat = SummarizeCompactionStrategy::default();
        // Bigger than threshold but fewer than tail_min_messages — skip.
        let messages = vec![
            user(&"x".repeat(100_000)),
            assistant_text(&"y".repeat(100_000)),
        ];
        assert!(!strat.should_compact(&messages, 1_000));
    }

    #[test]
    fn truncate_to_token_budget_cuts_on_char_boundaries() {
        assert_eq!(truncate_to_token_budget("abcdefgh", 0), "");
        assert_eq!(truncate_to_token_budget("abcdefgh", 1), "abcd");
        assert_eq!(truncate_to_token_budget("你好世界", 2), "你好");
        assert_eq!(truncate_to_token_budget("hi你好", 2), "hi你");
        // Budget larger than the text → text comes back whole.
        assert_eq!(truncate_to_token_budget("abc", 10), "abc");
    }

    #[test]
    fn build_compacted_history_truncates_oldest_retained_message() {
        // Token costs: 2, 1, 2. With a budget of 4 the two newest fit whole
        // (3 tokens) and the oldest is cut down to the remaining 1 token.
        let texts = vec![
            "abcdefgh".to_string(),
            "ijkl".to_string(),
            "mnopqrst".to_string(),
        ];
        let out = build_compacted_history(&texts, "sum", 4);
        assert_eq!(
            user_contents(&out),
            vec![
                "abcd",
                "ijkl",
                "mnopqrst",
                "<conversation-summary>\nsum\n</conversation-summary>",
            ]
        );
    }

    #[test]
    fn build_compacted_history_with_zero_budget_keeps_only_summary() {
        let texts = vec!["hello".to_string()];
        let out = build_compacted_history(&texts, "sum", 0);
        assert_eq!(
            user_contents(&out),
            vec!["<conversation-summary>\nsum\n</conversation-summary>"]
        );
    }

    // Despite the historical name, the folded shape is
    // `[...retained user messages, summary]` — there is no verbatim tail.
    #[tokio::test]
    async fn compact_folds_history_into_summary_plus_tail() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(2);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient {
                summary: "we ran ls and grep".into(),
            }),
            context_window_tokens: 10_000,
            tools: vec![],
        };
        let messages = vec![
            user("first user"),
            assistant_text("response 1"),
            user("second user"),
            tool_msg("tc1", "tool result"),
            user("third user"),
            assistant_text("final response"),
        ];
        let outcome = strat.compact(messages, &ctx).await.unwrap();
        let out = outcome.messages;
        // All three real user messages fit within the 20 000-token budget and
        // are retained verbatim. The summary is appended as the final message.
        assert_eq!(out.len(), 4, "3 user messages + 1 summary");
        match &out[0] {
            ChatMessage::User { content, .. } => assert_eq!(content, "first user"),
            other => panic!("expected User at [0], got {other:?}"),
        }
        match &out[1] {
            ChatMessage::User { content, .. } => assert_eq!(content, "second user"),
            other => panic!("expected User at [1], got {other:?}"),
        }
        match &out[2] {
            ChatMessage::User { content, .. } => assert_eq!(content, "third user"),
            other => panic!("expected User at [2], got {other:?}"),
        }
        // Summary is the last message.
        assert!(
            matches!(&out[3], ChatMessage::User { content, .. }
                if content.contains("<conversation-summary>") && content.contains("we ran ls and grep"))
        );
        // Output is shorter than input (6 messages → 4).
        assert!(out.len() < 6);
        // FixedSummaryClient doesn't report usage → outcome.usage is None.
        assert!(outcome.usage.is_none());
    }

    #[tokio::test]
    async fn compact_returns_empty_summary_error_on_blank_response() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(2);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient { summary: "".into() }),
            context_window_tokens: 10_000,
            tools: vec![],
        };
        let messages = vec![
            user("a"),
            assistant_text("b"),
            user("c"),
            assistant_text("d"),
        ];
        let err = strat.compact(messages, &ctx).await.unwrap_err();
        assert!(matches!(err, CompactionError::EmptySummary));
    }

    #[tokio::test]
    async fn compact_skips_when_messages_at_or_below_tail_min() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(4);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient {
                summary: "irrelevant".into(),
            }),
            context_window_tokens: 1_000,
            tools: vec![],
        };
        let messages = vec![
            user("1"),
            assistant_text("2"),
            user("3"),
            assistant_text("4"),
        ];
        let outcome = strat.compact(messages.clone(), &ctx).await.unwrap();
        // Same messages back — no compaction happened, no model call.
        assert_eq!(outcome.messages, messages);
        assert!(outcome.usage.is_none());
    }

    #[tokio::test]
    async fn compact_leaves_history_untouched_without_real_user_messages() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(1);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient {
                summary: "unused".into(),
            }),
            context_window_tokens: 1_000,
            tools: vec![],
        };
        // Above the message floor, but nothing here is a real user message.
        let messages = vec![
            assistant_text("a"),
            tool_msg("tc1", "b"),
            assistant_text("c"),
        ];
        let outcome = strat.compact(messages.clone(), &ctx).await.unwrap();
        assert_eq!(outcome.messages, messages);
        assert!(outcome.usage.is_none());
    }
}