agent-harness-rs 0.2.2

Agent loop harness with local and sandbox tool runtimes, context management, and MCP support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
//! Context-window-aware compaction.
//!
//! Long-running native sessions accumulate a `messages` history that
//! eventually exceeds the model's context window. Without intervention
//! the next turn either truncates server-side (silently losing context)
//! or fails with a 400 / `context_length_exceeded`. Compaction folds the
//! mid-conversation into a `<conversation-summary>` checkpoint while
//! preserving all true user messages verbatim within a token budget —
//! same idea as Codex local compaction.
//!
//! Strategy contract (`CompactionStrategy`):
//!   * `should_compact` — pure boolean gate; the agent loop checks this
//!     each step BEFORE building the next model request.
//!   * `compact` — best-effort; returns a `CompactionOutcome` whose
//!     `messages` replace the running history. Caller emits a separate
//!     event so `native_adapter` can mark the boundary on the wire.
//!
//! Producer of compaction:
//!   * `agent_loop::run_loop` constructs `SummarizeCompactionStrategy` per
//!     turn (cheap — no internal state) and calls `should_compact` /
//!     `compact` between steps.
//!
//! The strategy reuses the session's primary `ModelClient` to produce
//! the summary; that keeps cache keys hot on the prefix shared with the
//! main agent call (Anthropic prompt cache hits straight through).

use async_trait::async_trait;
use std::sync::Arc;

use crate::model::{
    collect_model_response, ChatMessage, ModelClient, ModelClientError, ModelResponse,
    ModelTurnInput,
};
use crate::tools::ToolSpec;

/// Fraction of the model's context window above which compaction fires.
/// 0.90 leaves a ~10 % headroom for the in-flight turn's tool results and
/// the model's reply, matching Codex's default threshold.
///
/// Used as the default for `SummarizeCompactionStrategy::trigger_fraction`.
pub const DEFAULT_TRIGGER_FRACTION: f64 = 0.90;

/// Minimum number of messages the history must contain before compaction
/// is allowed to run. Below this floor the history is too short to benefit
/// from compaction and the call is a no-op.
///
/// Despite the "tail" in the name, this value is only a size gate: both
/// `should_compact` and `compact` skip when `messages.len()` is less than
/// or equal to it. It does not decide which messages are retained.
pub const DEFAULT_TAIL_MIN_MESSAGES: usize = 4;

/// Soft cap on summary length when serialised back into a `User`
/// `<conversation-summary>` block. Above this we let the model decide,
/// but pass `max_tokens` hint so it doesn't ramble. 2 000 ≈ 8 KB —
/// enough for most multi-turn dialogues; OMA uses the same default.
///
/// NOTE(review): the `compact` implementation in this file stores this value
/// on the strategy but never reads it when building the model request —
/// confirm whether the hint is meant to be forwarded.
pub const DEFAULT_SUMMARY_MAX_TOKENS: i32 = 2_000;

/// Token budget for verbatim user-message retention in the replacement
/// history. All true user messages are collected from the full history
/// (oldest to newest) and as many as fit within this budget are kept
/// verbatim — newest first. Matches Codex's `COMPACT_USER_MESSAGE_MAX_TOKENS`.
///
/// The budget is measured with `estimate_tokens`, so it is an estimate
/// rather than a provider-exact token count.
pub const DEFAULT_USER_MESSAGE_TOKEN_BUDGET: u64 = 20_000;

/// Rough token count for `s` that treats ASCII and non-ASCII text differently.
///
/// * ASCII characters are charged at four characters per token, rounded up.
/// * Every non-ASCII character (CJK and other scripts) is charged one full
///   token, which avoids the ~3× undercount a flat `bytes / 4` heuristic
///   produces on Chinese text (one CJK char is 3 UTF-8 bytes).
/// * Empty or whitespace-only input costs 0.
///
/// This is an estimator for compaction decisions only — not for billing.
pub fn estimate_tokens(s: &str) -> u64 {
    // Whitespace-only (including empty) strings are free.
    if s.chars().all(char::is_whitespace) {
        return 0;
    }

    // Single pass: tally ASCII and non-ASCII characters side by side.
    let (ascii_chars, wide_chars) = s.chars().fold((0u64, 0u64), |(narrow, wide), ch| {
        if ch.is_ascii() {
            (narrow + 1, wide)
        } else {
            (narrow, wide + 1)
        }
    });

    wide_chars + ascii_chars.div_ceil(4)
}

/// Per-message estimate: `estimate_tokens` over every text part, plus
/// small fixed overheads for structural wrapping (tool-call envelope ≈ 8
/// tokens, tool-result envelope ≈ 16 — same budgets as the old byte-based
/// +32 / +64 at 4 bytes/token). Floors at 1 so empty messages still cost.
pub fn estimate_chat_message_tokens(m: &ChatMessage) -> u64 {
    let tokens = match m {
        ChatMessage::User { content, .. } => estimate_tokens(content),
        ChatMessage::Assistant {
            text,
            tool_calls,
            thinking,
        } => {
            let text_tokens = text.as_deref().map(estimate_tokens).unwrap_or(0);
            let tc_tokens: u64 = tool_calls
                .iter()
                .map(|tc| estimate_tokens(&tc.input.to_string()) + estimate_tokens(&tc.name) + 8)
                .sum();
            let thinking_tokens = thinking
                .as_ref()
                .map(|t| {
                    estimate_tokens(&t.text)
                        + t.signature.as_deref().map(estimate_tokens).unwrap_or(0)
                })
                .unwrap_or(0);
            text_tokens + tc_tokens + thinking_tokens
        }
        ChatMessage::Tool { content, .. } => estimate_tokens(content) + 16,
    };
    tokens.max(1)
}

/// Sum the per-message estimates. Stable across providers because we
/// only look at the rendered text / JSON size, not the wire shape.
pub fn estimate_messages_tokens(messages: &[ChatMessage]) -> u64 {
    messages.iter().map(estimate_chat_message_tokens).sum()
}

/// Look up the context window (in tokens) for a model name.
///
/// The mapping is a hand-maintained table because the provider wire APIs do
/// not expose the window size. Matching is case-insensitive and the rules are
/// evaluated in order, first match wins:
///
/// 1. names containing `opus-4-7`, `opus-4-6` or `sonnet-4-6` → 1 000 000
/// 2. any other name containing `claude` → 200 000
/// 3. names containing `gpt-4` (which also covers `gpt-4o`, `gpt-4.1`) → 128 000
/// 4. names starting with `o1`, `o3` or `o4` → 200 000
/// 5. names containing `minimax` or `deepseek` → 1 000 000
/// 6. anything else → 128 000 (conservative default)
///
/// To support a new model, add a rule here. Unknown names silently fall back
/// to the default: compaction is best-effort, so we prefer compacting a bit
/// early over refusing to run.
pub fn resolve_context_window_tokens(model: &str) -> u64 {
    const CONSERVATIVE_DEFAULT: u64 = 128_000;

    let name = model.to_ascii_lowercase();
    let mentions = |needle: &str| name.contains(needle);

    if mentions("opus-4-7") || mentions("opus-4-6") || mentions("sonnet-4-6") {
        // Extended-context Claude Opus / Sonnet releases.
        1_000_000
    } else if mentions("claude") {
        // Every other Claude model.
        200_000
    } else if mentions("gpt-4") {
        // GPT-4 family; the substring also matches gpt-4o and gpt-4.1.
        128_000
    } else if ["o1", "o3", "o4"]
        .iter()
        .any(|prefix| name.starts_with(*prefix))
    {
        // OpenAI o-series reasoning models (prefix match only).
        200_000
    } else if mentions("minimax") || mentions("deepseek") {
        // MiniMax / DeepSeek.
        1_000_000
    } else {
        CONSERVATIVE_DEFAULT
    }
}

/// Context passed to `compact()`. Owns the model client + tools the
/// summary call will use (mirrors the main agent's call so the prefix
/// stays cache-hot).
pub struct CompactionContext {
    /// System prompt copied verbatim into the summarize request; `None`
    /// sends the request without one.
    pub system_prompt: Option<String>,
    /// Client used to stream the summarize call.
    pub model_client: Arc<dyn ModelClient>,
    /// Context window of the target model, in tokens.
    /// NOTE(review): `SummarizeCompactionStrategy::compact` in this file does
    /// not read this field; the caller passes the window to `should_compact`
    /// as a separate argument.
    pub context_window_tokens: u64,
    /// Tool specs copied verbatim into the summarize request.
    pub tools: Vec<ToolSpec>,
}

/// Result of a single `CompactionStrategy::compact` call. `messages` is
/// the folded history caller installs; `usage` is the token spend for
/// the summarize round trip (provider-reported via the same path as
/// main turn calls). `usage` is `None` when the provider elides usage
/// or when the strategy short-circuited without calling the model.
#[derive(Debug, Clone, PartialEq)]
pub struct CompactionOutcome {
    /// History the caller should install. This may be the input history
    /// returned unchanged when the strategy decided not to fold anything.
    pub messages: Vec<ChatMessage>,
    /// Token usage of the summarize call, when one ran and reported it.
    pub usage: Option<crate::event::HarnessUsage>,
}

/// Failure modes of a `CompactionStrategy::compact` call.
#[derive(Debug, thiserror::Error)]
pub enum CompactionError {
    /// The summarize request itself failed (while opening the stream or
    /// while collecting the response). Wraps the underlying client error.
    #[error("compaction model call failed: {0}")]
    ModelCall(#[from] ModelClientError),
    /// The model returned an empty summary. We refuse to fold history
    /// in that case — losing N turns of conversation for zero gain is
    /// strictly worse than the original "ran out of context" failure.
    /// Caller treats this as a no-op and lets the next turn try again.
    ///
    /// `SummarizeCompactionStrategy` also reports this variant when the
    /// model answers the summarize request with a tool call instead of text.
    #[error("model produced empty summary; refusing to fold history")]
    EmptySummary,
}

/// Pluggable policy for shrinking a conversation history: a cheap synchronous
/// gate (`should_compact`) plus an async fold (`compact`).
#[async_trait]
pub trait CompactionStrategy: Send + Sync {
    /// Boolean gate. Pure (no side effects). Cheap to call every step.
    ///
    /// `context_window_tokens` is the model's context window in tokens;
    /// returns `true` when `messages` should be compacted before the next
    /// model request.
    fn should_compact(&self, messages: &[ChatMessage], context_window_tokens: u64) -> bool;

    /// Fold history. Caller hands over the full `messages` list and
    /// expects a shorter list back (for `SummarizeCompactionStrategy`:
    /// `[...retained user messages, summary]`). An implementation may also
    /// hand the input back unchanged when there is nothing worth folding.
    /// Failures bubble up; agent_loop treats them as "skip this turn's
    /// compaction" rather than failing the whole turn.
    async fn compact(
        &self,
        messages: Vec<ChatMessage>,
        ctx: &CompactionContext,
    ) -> Result<CompactionOutcome, CompactionError>;
}

/// Codex-style local compaction: send the full history to the model with a
/// handoff-summary prompt appended, collect the reply as a checkpoint, then
/// rebuild history as `[...retained user messages, summary]`.
///
/// Design rationale:
///   * User messages are preserved verbatim (up to `user_message_token_budget`)
///     because they carry precise constraints and goals that paraphrasing loses.
///   * Assistant / tool messages are folded into the summary — they are large
///     but low-density and tolerate lossy compression.
///   * The summary is placed LAST so the model reads it as the most recent
///     context rather than as background preamble.
///   * Reusing the same model client keeps the Anthropic prompt-cache prefix
///     hot (identical system + tools on every call).
pub struct SummarizeCompactionStrategy {
    /// Fraction of the context window above which `should_compact` fires.
    pub trigger_fraction: f64,
    /// Minimum total message count below which compaction is skipped.
    /// Does not control retention; use `user_message_token_budget` for that.
    pub tail_min_messages: usize,
    /// Intended cap on the summary length, in tokens.
    /// NOTE(review): not read by the `compact` implementation in this file —
    /// confirm whether it should be forwarded to the model request.
    pub summary_max_tokens: i32,
    /// Instruction appended as the final `User` message of the summarize
    /// request.
    pub summary_prompt: String,
    /// Token budget for verbatim user-message retention in the replacement history.
    pub user_message_token_budget: u64,
}

impl Default for SummarizeCompactionStrategy {
    /// Builds a strategy whose every knob comes from the module-level
    /// `DEFAULT_*` constants.
    fn default() -> Self {
        let summary_prompt = String::from(DEFAULT_SUMMARY_PROMPT);
        Self {
            summary_prompt,
            user_message_token_budget: DEFAULT_USER_MESSAGE_TOKEN_BUDGET,
            summary_max_tokens: DEFAULT_SUMMARY_MAX_TOKENS,
            tail_min_messages: DEFAULT_TAIL_MIN_MESSAGES,
            trigger_fraction: DEFAULT_TRIGGER_FRACTION,
        }
    }
}

/// Builder-style setters: each consumes the strategy and returns it with one
/// field replaced, so calls can be chained off `Default::default()`.
impl SummarizeCompactionStrategy {
    /// Replace the context-window fraction that triggers compaction.
    pub fn with_trigger_fraction(self, fraction: f64) -> Self {
        Self {
            trigger_fraction: fraction,
            ..self
        }
    }

    /// Replace the minimum history length required before compaction runs.
    pub fn with_tail_min_messages(self, n: usize) -> Self {
        Self {
            tail_min_messages: n,
            ..self
        }
    }

    /// Replace the summary length cap (in tokens).
    pub fn with_summary_max_tokens(self, n: i32) -> Self {
        Self {
            summary_max_tokens: n,
            ..self
        }
    }

    /// Replace the token budget for user messages retained verbatim.
    pub fn with_user_message_token_budget(self, budget: u64) -> Self {
        Self {
            user_message_token_budget: budget,
            ..self
        }
    }
}

/// Handoff-oriented summarise prompt. Instructs the model to produce a
/// structured checkpoint for *another agent instance* to resume from —
/// not a human-readable recap. Mirrors Codex's `SUMMARIZATION_PROMPT`.
///
/// This is the default value of `SummarizeCompactionStrategy::summary_prompt`,
/// which `compact` appends to the history as the final `User` message of the
/// summarize request. The text also asks the model to supersede any earlier
/// `<conversation-summary>` block, so repeated compactions produce an updated
/// checkpoint rather than a summary of a summary.
pub const DEFAULT_SUMMARY_PROMPT: &str = "You are performing a CONTEXT CHECKPOINT COMPACTION. \
    Create a handoff summary for another agent instance that will resume this task.\n\n\
    Include:\n\
    - Current progress and key decisions made\n\
    - Important context, constraints, or user preferences that must be respected\n\
    - What remains to be done (clear next steps)\n\
    - Any critical data, file paths, command outputs, or references needed to continue\n\n\
    If a prior <conversation-summary> block exists in this conversation, produce an UPDATED \
    summary that supersedes it (incorporating all activity since). \
    Output only the summary text — no preamble, no closing remarks.";

#[async_trait]
impl CompactionStrategy for SummarizeCompactionStrategy {
    /// Fires when the history is longer than `tail_min_messages` AND its
    /// estimated token count exceeds `trigger_fraction` of the context window.
    fn should_compact(&self, messages: &[ChatMessage], context_window_tokens: u64) -> bool {
        // Short histories never compact: the summarize call would cost more
        // than the prefix it saves.
        if messages.len() <= self.tail_min_messages {
            return false;
        }
        let trigger_at = (self.trigger_fraction * context_window_tokens as f64).round() as u64;
        estimate_messages_tokens(messages) > trigger_at
    }

    /// Summarize `messages` with the session's model and rebuild the history
    /// as `[...retained user messages, summary]`.
    ///
    /// Returns the input unchanged (with `usage: None`) when the history is at
    /// or below `tail_min_messages`, and unchanged (with the reported usage)
    /// when the history contains no real user messages.
    ///
    /// # Errors
    /// * `CompactionError::ModelCall` — the summarize request failed.
    /// * `CompactionError::EmptySummary` — the model replied with a tool call
    ///   or with blank text.
    async fn compact(
        &self,
        messages: Vec<ChatMessage>,
        ctx: &CompactionContext,
    ) -> Result<CompactionOutcome, CompactionError> {
        // Size gate: hand short histories straight back. No model call ran,
        // so there is no usage to report.
        if messages.len() <= self.tail_min_messages {
            return Ok(CompactionOutcome {
                messages,
                usage: None,
            });
        }

        // Summarize request = full history + one trailing User message that
        // carries the handoff prompt. System prompt and tools are passed
        // through unchanged (tools are deliberately NOT emptied) so the
        // request prefix matches what the main agent call sent and the
        // provider-side prompt cache can be reused.
        let handoff_prompt = ChatMessage::User {
            content: self.summary_prompt.clone(),
            attachments: vec![],
        };
        let request = ModelTurnInput {
            system_prompt: ctx.system_prompt.clone(),
            messages: messages
                .iter()
                .cloned()
                .chain(std::iter::once(handoff_prompt))
                .collect(),
            tools: ctx.tools.clone(),
            tool_choice: crate::model::ToolChoice::Auto,
            parallel_tool_calls: None,
        };

        // Compaction only needs the final text, so drain the stream into a
        // single response. We go through stream + collect explicitly (rather
        // than a one-shot client call) to leave room for strategies that want
        // to filter tool calls / thinking blocks on the way.
        let chunks = ctx.model_client.stream(request).await?;
        let response = collect_model_response(chunks).await?;
        let (summary_text, usage) = match response {
            // The model chose to call a tool instead of answering: give up on
            // this round and leave the history alone.
            ModelResponse::ToolCall { .. } => return Err(CompactionError::EmptySummary),
            ModelResponse::Message { text, usage, .. } => (text, usage),
        };
        if summary_text.trim().is_empty() {
            return Err(CompactionError::EmptySummary);
        }

        // Real user messages only — earlier summary blocks are superseded by
        // the checkpoint we just generated.
        let user_texts = collect_user_message_texts(&messages);
        if user_texts.is_empty() {
            // Nothing to retain verbatim: keep the history as it was, but
            // still surface usage so the discarded model call is accounted for.
            return Ok(CompactionOutcome { messages, usage });
        }

        // Replacement history: user messages picked newest-first within the
        // token budget and emitted chronologically, then the summary as the
        // very last message so it reads as the most recent context.
        let messages = build_compacted_history(
            &user_texts,
            &summary_text,
            self.user_message_token_budget,
        );
        Ok(CompactionOutcome { messages, usage })
    }
}

/// Wrap `summary` in a `<conversation-summary>` element, with the opening
/// tag, the text and the closing tag each on their own line(s).
fn serialize_summary(summary: &str) -> String {
    ["<conversation-summary>\n", summary, "\n</conversation-summary>"].concat()
}

/// Gather the content of each genuine `User` message, preserving order.
///
/// `User` messages that are themselves earlier summary blocks (see
/// `is_summary_message`) are skipped: the new checkpoint supersedes them, so
/// they must not be carried into the replacement history. Assistant and tool
/// messages are ignored.
fn collect_user_message_texts(messages: &[ChatMessage]) -> Vec<String> {
    let mut texts = Vec::new();
    for message in messages {
        if let ChatMessage::User { content, .. } = message {
            if !is_summary_message(content) {
                texts.push(content.clone());
            }
        }
    }
    texts
}

/// Whether `content` is a serialized summary block, i.e. its first
/// non-whitespace text is the `<conversation-summary>` opening tag.
fn is_summary_message(content: &str) -> bool {
    const OPENING_TAG: &str = "<conversation-summary>";
    let body = content.trim_start();
    body.starts_with(OPENING_TAG)
}

/// Assemble the post-compaction history: the retained user messages in
/// chronological order, then the serialized summary as the final message.
///
/// `user_texts` holds every real user message, oldest → newest. Messages are
/// picked newest-first while they fit in `token_budget` (measured with
/// `estimate_tokens`). The first message that does not fit whole is truncated
/// to the remaining budget rather than dropped, and selection stops there.
/// With a budget of 0 only the summary is emitted.
fn build_compacted_history(
    user_texts: &[String],
    summary_text: &str,
    token_budget: u64,
) -> Vec<ChatMessage> {
    let mut budget_left = token_budget;
    let mut kept_newest_first: Vec<String> = Vec::new();
    let mut candidates = user_texts.iter().rev();

    while budget_left > 0 {
        let Some(text) = candidates.next() else {
            break;
        };
        let cost = estimate_tokens(text);
        if cost > budget_left {
            // Only part of it fits: keep a truncated copy so the leftover
            // budget is not wasted, then stop selecting.
            kept_newest_first.push(truncate_to_token_budget(text, budget_left));
            break;
        }
        kept_newest_first.push(text.clone());
        budget_left -= cost;
    }

    // Flip back to chronological order and append the summary last, so the
    // model reads it as the most recent context.
    kept_newest_first
        .into_iter()
        .rev()
        .chain(std::iter::once(serialize_summary(summary_text)))
        .map(|content| ChatMessage::User {
            content,
            attachments: vec![],
        })
        .collect()
}

/// Return the longest prefix of `s` whose estimated cost stays within
/// `budget` tokens.
///
/// Uses the same per-character accounting as `estimate_tokens` (ASCII at four
/// characters per token rounded up, one token per non-ASCII character) and
/// always cuts on a character boundary. A `budget` of 0 yields an empty string.
fn truncate_to_token_budget(s: &str, budget: u64) -> String {
    if budget == 0 {
        return String::new();
    }

    let mut ascii_seen: u64 = 0;
    let mut wide_seen: u64 = 0;
    s.chars()
        .take_while(|ch| {
            // Charge the candidate character, then keep it only if the
            // running estimate is still within budget.
            if ch.is_ascii() {
                ascii_seen += 1;
            } else {
                wide_seen += 1;
            }
            ascii_seen.div_ceil(4) + wide_seen <= budget
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{ModelChunk, ModelClient};
    use crate::tools::ToolInvocation;
    use async_trait::async_trait;
    use futures::stream::{BoxStream, StreamExt};

    /// In-process model client that returns a fixed summary string.
    /// Test fixture only — production summarisation goes through the
    /// real model client.
    #[derive(Clone)]
    struct FixedSummaryClient {
        summary: String,
    }
    #[async_trait]
    impl ModelClient for FixedSummaryClient {
        /// Emits the canned summary as a single text delta followed by a
        /// `Done` chunk that carries no usage.
        async fn stream(
            &self,
            _input: ModelTurnInput,
        ) -> Result<BoxStream<'static, Result<ModelChunk, ModelClientError>>, ModelClientError>
        {
            let chunks = vec![
                Ok(ModelChunk::TextDelta {
                    msg_id: "sum".into(),
                    delta: self.summary.clone(),
                }),
                Ok(ModelChunk::Done {
                    stop_reason: "end_turn".into(),
                    usage: None,
                }),
            ];
            Ok(futures::stream::iter(chunks).boxed())
        }
    }

    /// `User` message with no attachments.
    fn user(s: &str) -> ChatMessage {
        ChatMessage::User {
            content: s.into(),
            attachments: vec![],
        }
    }

    /// Text-only `Assistant` message (no tool calls, no thinking block).
    fn assistant_text(s: &str) -> ChatMessage {
        ChatMessage::Assistant {
            text: Some(s.into()),
            tool_calls: vec![],
            thinking: None,
        }
    }

    /// Successful `Tool` result message with no attachments.
    fn tool_msg(id: &str, content: &str) -> ChatMessage {
        ChatMessage::Tool {
            tool_call_id: id.into(),
            content: content.into(),
            is_error: false,
            attachments: vec![],
        }
    }

    #[test]
    fn token_estimate_grows_with_content_size() {
        let small = user("hi");
        let big = user(&"x".repeat(8000));
        assert!(estimate_chat_message_tokens(&big) > estimate_chat_message_tokens(&small));
    }

    #[test]
    fn estimate_tokens_splits_ascii_and_cjk() {
        // ASCII at 4 chars/token (ceil), non-ASCII at 1 token/char,
        // whitespace-only is free.
        assert_eq!(estimate_tokens(""), 0);
        assert_eq!(estimate_tokens("   \n"), 0);
        assert_eq!(estimate_tokens("abcd"), 1);
        assert_eq!(estimate_tokens("abcde"), 2); // ceil(5/4)
        assert_eq!(estimate_tokens("你好世界"), 4); // 4 CJK chars = 4 tokens
        assert_eq!(estimate_tokens("hi你好"), 3); // ceil(2/4)=1 + 2
    }

    #[test]
    fn token_estimate_counts_cjk_near_one_per_char() {
        // 1000 CJK chars ≈ 1000 tokens. The old bytes/4 heuristic said
        // ~750 (3 UTF-8 bytes / 4); the rune-aware estimator must not
        // undercount, or compaction triggers too late on Chinese text.
        //
        // The repeated literal must be a real CJK character: repeating an
        // empty string yields an empty message, whose estimate floors at 1
        // and can never reach 1000.
        let cjk = user(&"中".repeat(1000));
        let estimate = estimate_chat_message_tokens(&cjk);
        assert!(
            estimate >= 1000,
            "CJK undercounted: got {estimate}, want >= 1000"
        );
    }

    #[test]
    fn token_estimate_includes_tool_call_input() {
        // Same text length, but assistant carrying a tool_call should
        // cost more (we count the JSON arguments).
        let bare = assistant_text("done");
        let with_tool = ChatMessage::Assistant {
            text: Some("done".into()),
            tool_calls: vec![ToolInvocation {
                id: "tc".into(),
                name: "bash".into(),
                input: serde_json::json!({"command": "echo lots of bytes here for sure"}),
            }],
            thinking: None,
        };
        assert!(estimate_chat_message_tokens(&with_tool) > estimate_chat_message_tokens(&bare));
    }

    #[test]
    fn context_window_table_known_models() {
        assert_eq!(resolve_context_window_tokens("claude-opus-4-7"), 1_000_000);
        assert_eq!(
            resolve_context_window_tokens("claude-sonnet-4-6"),
            1_000_000
        );
        assert_eq!(resolve_context_window_tokens("claude-haiku-4-5"), 200_000);
        assert_eq!(resolve_context_window_tokens("claude-3-5-sonnet"), 200_000);
        assert_eq!(resolve_context_window_tokens("gpt-4o"), 128_000);
        assert_eq!(resolve_context_window_tokens("gpt-4.1-mini"), 128_000);
        assert_eq!(resolve_context_window_tokens("o3-mini"), 200_000);
        assert_eq!(resolve_context_window_tokens("MiniMax-M2"), 1_000_000);
        // Unknown → conservative default.
        assert_eq!(resolve_context_window_tokens("unknown-model"), 128_000);
    }

    #[test]
    fn should_compact_skips_when_below_threshold() {
        let strat = SummarizeCompactionStrategy::default();
        let messages = vec![user("hello"), assistant_text("hi")];
        // 200K window, tiny conversation — never fires.
        assert!(!strat.should_compact(&messages, 200_000));
    }

    #[test]
    fn should_compact_fires_when_above_threshold() {
        let strat = SummarizeCompactionStrategy::default();
        // 5 messages * 8000 ASCII chars each = 5 * 2 000 = 10 000 tokens.
        // With an 11K window the 90% threshold is 9 900 tokens, so
        // 10K tokens exceeds it and compaction must fire.
        let messages = vec![
            user(&"x".repeat(8000)),
            assistant_text(&"y".repeat(8000)),
            user(&"x".repeat(8000)),
            assistant_text(&"y".repeat(8000)),
            user(&"x".repeat(8000)),
        ];
        assert!(strat.should_compact(&messages, 11_000));
    }

    #[test]
    fn should_compact_respects_tail_min_floor() {
        let strat = SummarizeCompactionStrategy::default();
        // Bigger than threshold but fewer than tail_min_messages — skip.
        let messages = vec![
            user(&"x".repeat(100_000)),
            assistant_text(&"y".repeat(100_000)),
        ];
        assert!(!strat.should_compact(&messages, 1_000));
    }

    #[tokio::test]
    async fn compact_folds_history_into_summary_plus_tail() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(2);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient {
                summary: "we ran ls and grep".into(),
            }),
            context_window_tokens: 10_000,
            tools: vec![],
        };
        let messages = vec![
            user("first user"),
            assistant_text("response 1"),
            user("second user"),
            tool_msg("tc1", "tool result"),
            user("third user"),
            assistant_text("final response"),
        ];
        let outcome = strat.compact(messages, &ctx).await.unwrap();
        let out = outcome.messages;
        // All three real user messages fit within the 20 000-token budget and
        // are retained verbatim. The summary is appended as the final message.
        assert_eq!(out.len(), 4, "3 user messages + 1 summary");
        match &out[0] {
            ChatMessage::User { content, .. } => assert_eq!(content, "first user"),
            other => panic!("expected User at [0], got {other:?}"),
        }
        match &out[1] {
            ChatMessage::User { content, .. } => assert_eq!(content, "second user"),
            other => panic!("expected User at [1], got {other:?}"),
        }
        match &out[2] {
            ChatMessage::User { content, .. } => assert_eq!(content, "third user"),
            other => panic!("expected User at [2], got {other:?}"),
        }
        // Summary is the last message.
        assert!(
            matches!(&out[3], ChatMessage::User { content, .. }
                if content.contains("<conversation-summary>") && content.contains("we ran ls and grep"))
        );
        // Output is shorter than input (6 messages → 4).
        assert!(out.len() < 6);
        // FixedSummaryClient doesn't report usage → outcome.usage is None.
        assert!(outcome.usage.is_none());
    }

    #[tokio::test]
    async fn compact_returns_empty_summary_error_on_blank_response() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(2);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient { summary: "".into() }),
            context_window_tokens: 10_000,
            tools: vec![],
        };
        let messages = vec![
            user("a"),
            assistant_text("b"),
            user("c"),
            assistant_text("d"),
        ];
        let err = strat.compact(messages, &ctx).await.unwrap_err();
        assert!(matches!(err, CompactionError::EmptySummary));
    }

    #[tokio::test]
    async fn compact_skips_when_messages_at_or_below_tail_min() {
        let strat = SummarizeCompactionStrategy::default().with_tail_min_messages(4);
        let ctx = CompactionContext {
            system_prompt: None,
            model_client: Arc::new(FixedSummaryClient {
                summary: "irrelevant".into(),
            }),
            context_window_tokens: 1_000,
            tools: vec![],
        };
        let messages = vec![
            user("1"),
            assistant_text("2"),
            user("3"),
            assistant_text("4"),
        ];
        let outcome = strat.compact(messages.clone(), &ctx).await.unwrap();
        // Same messages back — no compaction happened, no model call.
        assert_eq!(outcome.messages, messages);
        assert!(outcome.usage.is_none());
    }
}