xcodeai 2.1.0

Autonomous AI coding agent — zero human intervention, sbox sandboxed, OpenAI-compatible
// src/agent/context_manager.rs
//
// Smart context-window management for the xcodeai agent loop.
//
// # Problem
//
// Every LLM has a finite context window (e.g. 128k tokens ≈ 512k chars).
// As the conversation grows the message history eventually fills the window,
// causing either a context-length API error or silently degraded reasoning
// (the model can't "see" old messages any more).
//
// # Two strategies
//
// The config lets users choose between two strategies:
//
//   "truncate"   — drop the oldest messages (fast, no extra LLM call, lossy)
//   "summarize"  — send the oldest messages to the same LLM and ask for a
//                  compact summary, then replace those messages with the
//                  summary (slower, one extra LLM call, much less lossy)
//
// # Threshold
//
// We trigger management when the total char count exceeds
// `budget_chars * threshold` (by default 80 % of 400 000 chars).
// This gives the agent a safety margin — we start compressing before the
// window is completely full, not after the API rejects the request.
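//
// With the defaults this works out to (illustrative arithmetic):
//
//     trigger_chars = budget_chars * threshold
//                   = 400_000 * 0.80
//                   = 320_000 chars  (~80_000 tokens at ~4 chars/token)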
//
// # Summarisation fallback
//
// If the summarisation LLM call fails for any reason (network error, model
// refuses, etc.) we silently fall back to the old truncation strategy so the
// agent loop always continues rather than crashing.
//
// # Rust learner notes
//
// - `async fn` inside `impl ContextManager` is straightforward; the only
//   requirement is that `ContextManager::manage()` is called with `.await`.
// - `tracing::warn!` and `tracing::info!` are used instead of `eprintln!`
//   so the messages go through the logging subsystem and can be filtered.
// - The `#[serde(default)]` attribute on `ContextConfig` means config files
//   that omit any of these fields silently use the defaults below.  (A config
//   with no `"context"` key at all also needs a default on the parent field.)

use crate::llm::{LlmProvider, Message};
use anyhow::Result;
use serde::{Deserialize, Serialize};
use tracing::{info, warn};

// ─── Configuration ────────────────────────────────────────────────────────────

/// Strategy to use when the context window approaches capacity.
#[derive(Debug, Clone, PartialEq, Deserialize, Serialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum ContextStrategy {
    /// Drop oldest messages (fast, lossy).
    Truncate,
    /// Summarise oldest messages via an LLM call, then replace them (slower, less lossy).
    #[default]
    Summarize,
}

/// Configuration for context-window management.
///
/// Stored under `agent.context` in `config.json`.  Every field has a
/// `#[serde(default)]` fallback so old config files don't need updating.
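///
/// An illustrative `config.json` fragment (shape inferred from the fields
/// below; the surrounding schema is not shown in this file):
///
/// ```json
/// {
///   "agent": {
///     "context": {
///       "strategy": "summarize",
///       "budget_chars": 400000,
///       "threshold": 0.8,
///       "summary_max_tokens": 800
///     }
///   }
/// }
/// ```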
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(default)]
pub struct ContextConfig {
    /// Strategy to apply when the budget is approached.
    pub strategy: ContextStrategy,
    /// Total character budget (default 400 000 ≈ 100 000 tokens @ 4 chars/token).
    pub budget_chars: usize,
    /// Fraction of `budget_chars` at which management is triggered (default 0.80).
    /// A value of 0.80 means we act at 80 % usage, leaving 20 % headroom.
    pub threshold: f64,
    /// Maximum tokens for the summary response (default 800).
    /// Keeping this low ensures the summary LLM call is cheap and fast.
    pub summary_max_tokens: u32,
}

impl Default for ContextConfig {
    fn default() -> Self {
        ContextConfig {
            strategy: ContextStrategy::default(),
            budget_chars: 400_000,
            threshold: 0.80,
            summary_max_tokens: 800,
        }
    }
}

// ─── ContextManager ───────────────────────────────────────────────────────────

/// Manages the message history to keep it within the context window.
///
/// Create one per agent run and call `manage()` at the top of every iteration.
pub struct ContextManager {
    pub config: ContextConfig,
    /// How many times we have summarised the context this session.
    pub summarize_count: u32,
}

impl ContextManager {
    /// Create a new manager with the given configuration.
    pub fn new(config: ContextConfig) -> Self {
        ContextManager {
            config,
            summarize_count: 0,
        }
    }

    /// Ensure `messages` fits within the context budget.
    ///
    /// Depending on `self.config.strategy`:
    /// - `Truncate`   → calls the existing simple truncation helper
    /// - `Summarize`  → tries LLM-based summarisation; falls back to truncation on failure
    ///
    /// The `messages` slice is mutated in place.
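    ///
    /// A sketch of the intended call site (assumes some `provider: impl LlmProvider`
    /// and the agent's own `messages`/`tools` are in scope; not a compiled doctest):
    ///
    /// ```ignore
    /// let mut mgr = ContextManager::new(ContextConfig::default());
    /// loop {
    ///     // Compress *before* each round-trip so the request never overflows.
    ///     mgr.manage(&mut messages, &provider).await?;
    ///     let response = provider.chat_completion(&messages, &tools).await?;
    ///     // ... handle response, push new messages ...
    /// }
    /// ```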
    pub async fn manage(
        &mut self,
        messages: &mut Vec<Message>,
        llm: &dyn LlmProvider,
    ) -> Result<()> {
        let total_chars: usize = messages
            .iter()
            .map(|m| m.text_content().map(|s| s.len()).unwrap_or(0))
            .sum();

        // Check whether we're above the threshold.
        let trigger_chars = (self.config.budget_chars as f64 * self.config.threshold) as usize;
        if total_chars <= trigger_chars {
            // At or below the trigger point: nothing to do.
            return Ok(());
        }

        info!(
            total_chars,
            budget_chars = self.config.budget_chars,
            "Context window approaching limit — applying {:?} strategy",
            self.config.strategy,
        );

        match self.config.strategy {
            ContextStrategy::Truncate => {
                truncate_messages(messages, self.config.budget_chars);
            }
            ContextStrategy::Summarize => {
                // Try to summarise; if anything goes wrong, fall back to truncation.
                match self.try_summarize(messages, llm).await {
                    Ok(()) => {
                        self.summarize_count += 1;
                        info!(
                            summarize_count = self.summarize_count,
                            "Context summarised successfully"
                        );
                    }
                    Err(e) => {
                        warn!(
                            error = %e,
                            "Context summarisation failed — falling back to truncation"
                        );
                        truncate_messages(messages, self.config.budget_chars);
                    }
                }
            }
        }

        Ok(())
    }

    /// Attempt LLM-based summarisation of the oldest messages.
    ///
    /// Strategy:
    ///   1. Keep messages[0] (system prompt) and messages.last() (most recent).
    ///   2. Collect the "middle" messages that pushed us over budget.
    ///   3. Send them to the LLM with a summarisation system prompt.
    ///   4. Replace the middle messages with a single `[Context Summary]` user message.
    ///
    /// Returns an error if the LLM call fails (caller falls back to truncation).
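    ///
    /// Sketch of the rebuild for a 6-message history (indices illustrative;
    /// the halving rule below gives a `summarise_count` of 2):
    ///
    /// ```text
    /// before: [system, m1, m2, m3, m4, m5]
    /// after:  [system, "[Context Summary — condensed from 2 messages] ...", m3, m4, m5]
    /// ```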
    pub async fn try_summarize(
        &self,
        messages: &mut Vec<Message>,
        llm: &dyn LlmProvider,
    ) -> Result<()> {
        if messages.len() < 4 {
            // Fewer than 4 messages means at most system + user + assistant,
            // which is too little to summarise meaningfully.  Just truncate instead.
            truncate_messages(messages, self.config.budget_chars);
            return Ok(());
        }

        // Decide how many messages from the beginning (after the system prompt) to
        // hand to the summariser.  We aim to summarise roughly the oldest half of
        // the non-system messages, but at least 2.
        //
        // Example: 20 messages → summarise messages[1..10], keep messages[10..20].
        let non_system_count = messages.len().saturating_sub(1); // exclude system prompt
        let summarise_count = (non_system_count / 2).max(2);
        // The slice to summarise: messages[1 .. 1+summarise_count]
        let summarise_end = 1 + summarise_count;

        // Build the text we will hand to the LLM.
        let context_text: String = messages[1..summarise_end]
            .iter()
            .filter_map(|m| {
                m.text_content().map(|text| {
                    let role_label = match m.role {
                        crate::llm::Role::User => "User",
                        crate::llm::Role::Assistant => "Assistant",
                        crate::llm::Role::System => "System",
                        crate::llm::Role::Tool => "Tool",
                    };
                    format!("[{role_label}]: {text}")
                })
            })
            .collect::<Vec<_>>()
            .join("\n\n");
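
        // The resulting `context_text` looks roughly like this (illustrative):
        //
        //   [User]: please fix the failing test in src/agent/mod.rs
        //
        //   [Assistant]: Looking at the test output, the assertion fails because...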

        // Build a minimal one-shot summarisation conversation.
        // We use a fresh Vec (not the agent's own messages) to keep the call isolated.
        let summarisation_prompt = format!(
            "The following is an excerpt from a coding session conversation. \
             Summarise the key context, decisions, file changes, tool calls, \
             and current state in 500 words or less.  Be factual and concise — \
             this summary will be injected back into the conversation so the \
             agent can continue working without losing important context.\n\n\
             --- Conversation excerpt ---\n\n{context_text}\n\n--- End excerpt ---"
        );
        let summary_request = vec![Message::user(&summarisation_prompt)];

        // Re-use the same provider but with an empty tool list (no tools needed for summary).
        // We don't thread the config's summary_max_tokens through the generic LlmProvider
        // trait — that would require API changes.  Instead we rely on the model's default
        // behaviour for short summarisation tasks (it will naturally produce a short reply).
        let summary_response = llm.chat_completion(&summary_request, &[]).await?;

        let summary_text = summary_response
            .content
            .unwrap_or_else(|| "[Context summary unavailable]".to_string());

        // Replace the summarised slice with a single user message containing the summary.
        // This keeps the conversation as a valid message sequence (user → assistant → ...).
        let summary_message = Message::user(format!(
            "[Context Summary — condensed from {} messages]\n\n{}",
            summarise_count, summary_text
        ));

        // Rebuild: system + summary_message + rest of messages
        let rest = messages.split_off(summarise_end); // messages is now messages[0..summarise_end]
        messages.truncate(1); // keep only the system prompt
        messages.push(summary_message);
        messages.extend(rest);

        Ok(())
    }
}

// ─── Simple truncation (kept as fallback) ────────────────────────────────────

/// Truncate messages to fit within approximate token budget.
///
/// Strategy: always keep system prompt (index 0) + first user message (index 1)
/// + as many messages from the END as fit within `budget_chars`.
///
/// This is the original strategy from `agent/mod.rs`, preserved here as the
/// fallback when LLM summarisation fails or is disabled.
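///
/// Sketch for a 6-message history where only the last two fit (illustrative):
///
/// ```text
/// before: [system, first_user, m2, m3, m4, m5]
/// after:  [system, first_user, "[... 2 messages truncated for context window ...]", m4, m5]
/// ```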
pub fn truncate_messages(messages: &mut Vec<Message>, budget_chars: usize) {
    if messages.is_empty() {
        return;
    }
    let total_chars: usize = messages
        .iter()
        .map(|m| m.text_content().map(|s| s.len()).unwrap_or(0))
        .sum();
    if total_chars <= budget_chars {
        return;
    }

    // Always keep system prompt and first user message.
    let mut keep = vec![messages[0].clone()]; // non-empty: early return above
    if messages.len() > 1 {
        keep.push(messages[1].clone());
    }

    // Find how many messages from the end fit in the budget.
    let mut chars = keep
        .iter()
        .map(|m| m.text_content().map(|s| s.len()).unwrap_or(0))
        .sum::<usize>();
    let mut tail = Vec::new();
    for m in messages.iter().skip(2).rev() {
        let mc = m.text_content().map(|s| s.len()).unwrap_or(0);
        if chars + mc > budget_chars {
            break;
        }
        tail.push(m.clone());
        chars += mc;
    }
    tail.reverse();

    let truncated_count = messages.len() - (keep.len() + tail.len());
    if truncated_count > 0 {
        warn!("Truncating {} messages for context window", truncated_count);
        keep.push(Message::user(format!(
            "[... {} messages truncated for context window ...]",
            truncated_count
        )));
    }
    keep.extend(tail);
    *messages = keep;
}

// ─── Tests ────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use crate::llm::Message;

    fn make_messages(n: usize) -> Vec<Message> {
        let mut msgs = vec![Message::system("You are a helpful assistant.")];
        for i in 0..n {
            let big = "x".repeat(50_000); // 50k chars each
            if i % 2 == 0 {
                msgs.push(Message::user(format!("user message {}: {}", i, big)));
            } else {
                msgs.push(Message::assistant(
                    Some(format!("assistant reply {}: {}", i, big)),
                    None,
                ));
            }
        }
        msgs
    }

    // ── truncate_messages ──────────────────────────────────────────────────

    #[test]
    fn test_truncate_noop_when_under_budget() {
        let mut msgs = vec![
            Message::system("system"),
            Message::user("hello"),
            Message::assistant(Some("hi".to_string()), None),
        ];
        let original_len = msgs.len();
        truncate_messages(&mut msgs, 400_000);
        assert_eq!(
            msgs.len(),
            original_len,
            "should not truncate when under budget"
        );
    }

    #[test]
    fn test_truncate_keeps_system_and_recent() {
        // system prompt + 9 messages of 50k chars each ≈ 450k total, budget = 400k
        let mut msgs = make_messages(9); // system + 9 more = 10 messages
        let system_content = msgs[0].text_content().unwrap();

        truncate_messages(&mut msgs, 400_000);

        // System prompt must survive.
        assert_eq!(
            msgs[0].text_content().unwrap(),
            system_content,
            "system prompt must be preserved"
        );
        // Total chars after truncation should fit in budget.
        let total: usize = msgs
            .iter()
            .map(|m| m.text_content().map(|s| s.len()).unwrap_or(0))
            .sum();
        assert!(
            total <= 400_000,
            "total chars after truncation should be ≤ budget, got {total}"
        );
    }

    #[test]
    fn test_truncate_inserts_marker() {
        let mut msgs = make_messages(9);
        truncate_messages(&mut msgs, 400_000);
        // There should be a message containing "[... N messages truncated ...]"
        let has_marker = msgs.iter().any(|m| {
            m.text_content()
                .map(|t| t.contains("messages truncated"))
                .unwrap_or(false)
        });
        assert!(has_marker, "truncated marker message should be present");
    }

    // ── ContextConfig defaults ─────────────────────────────────────────────

    #[test]
    fn test_context_config_defaults() {
        let cfg = ContextConfig::default();
        assert_eq!(cfg.budget_chars, 400_000);
        assert!((cfg.threshold - 0.80).abs() < 1e-9);
        assert_eq!(cfg.strategy, ContextStrategy::Summarize);
        assert_eq!(cfg.summary_max_tokens, 800);
    }
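
    // A sketch of a serde round-trip check for the `#[serde(default)]`
    // behaviour described in the header.  Assumes `serde_json` is available
    // as a dev-dependency, which this file does not confirm.
    #[test]
    fn test_context_config_defaults_from_empty_json() {
        let cfg: ContextConfig =
            serde_json::from_str("{}").expect("empty object should deserialize via defaults");
        assert_eq!(cfg.budget_chars, 400_000);
        assert_eq!(cfg.strategy, ContextStrategy::Summarize);
    }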

    // ── ContextManager with Truncate strategy ─────────────────────────────

    #[tokio::test]
    async fn test_context_manager_truncate_noop_under_threshold() {
        let cfg = ContextConfig {
            strategy: ContextStrategy::Truncate,
            budget_chars: 400_000,
            threshold: 0.80,
            summary_max_tokens: 800,
        };
        let mut mgr = ContextManager::new(cfg);

        // Small messages — well under budget.
        let mut msgs = vec![Message::system("system"), Message::user("hello")];
        let len_before = msgs.len();

        // We pass a dummy LlmProvider ref — it should never be called since
        // we're under the threshold.  Use a NullProvider for the test.
        struct NullProvider;
        #[async_trait::async_trait]
        impl crate::llm::LlmProvider for NullProvider {
            async fn chat_completion(
                &self,
                _messages: &[crate::llm::Message],
                _tools: &[crate::llm::ToolDefinition],
            ) -> Result<crate::llm::LlmResponse> {
                panic!("should not be called");
            }
        }

        mgr.manage(&mut msgs, &NullProvider).await.unwrap();
        assert_eq!(msgs.len(), len_before, "no-op when under threshold");
    }

    #[tokio::test]
    async fn test_context_manager_truncate_strategy() {
        let cfg = ContextConfig {
            strategy: ContextStrategy::Truncate,
            budget_chars: 100_000,
            threshold: 0.80,
            summary_max_tokens: 800,
        };
        let mut mgr = ContextManager::new(cfg);

        // system + 3 messages of 50k chars each ≈ 150k total; trigger = 80k.
        let mut msgs = vec![
            Message::system("system"),
            Message::user("x".repeat(50_000)),
            Message::assistant(Some("y".repeat(50_000)), None),
            Message::user("z".repeat(50_000)),
        ];

        struct NullProvider;
        #[async_trait::async_trait]
        impl crate::llm::LlmProvider for NullProvider {
            async fn chat_completion(
                &self,
                _: &[crate::llm::Message],
                _: &[crate::llm::ToolDefinition],
            ) -> Result<crate::llm::LlmResponse> {
                panic!("truncate should not call LLM");
            }
        }

        mgr.manage(&mut msgs, &NullProvider).await.unwrap();

        let total: usize = msgs
            .iter()
            .map(|m| m.text_content().map(|s| s.len()).unwrap_or(0))
            .sum();
        assert!(
            total <= 100_000,
            "should fit within budget after truncation, got {total}"
        );
    }
}