1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
mod compaction;
#[cfg(test)]
mod tests;
use crate::api::content::Content;
use crate::api::models::Message;
/// Size limit associated with the compaction log, visible to the parent module.
///
/// The code that applies this limit is not in this part of the file —
/// presumably it is the maximum number of `CompactionEvent` entries kept in
/// `ConversationContext::compaction_log`; TODO confirm in the `compaction` module.
pub(super) const MAX_COMPACTION_LOG: usize = 20;
/// Manages conversation history with token-aware context window.
///
/// Besides the messages themselves, the struct keeps several derived values
/// (`cached_msg_tokens`, `cached_tool_count`, `cached_system_content`) that
/// must be kept in sync whenever `messages`, `system_prompt` or
/// `last_reasoning` change.
#[derive(Clone)]
pub struct ConversationContext {
/// Conversation messages in push order. The system message is not stored
/// here; `build_messages()` prepends it.
messages: Vec<Message>,
/// Base system prompt text, without preserved reasoning or decisions.
system_prompt: String,
/// Approximate max tokens for the entire context.
max_context_tokens: usize,
/// Trigger compaction at this fraction of budget.
compaction_threshold: f32,
/// When true, adaptively adjusts compaction_threshold based on conversation characteristics.
adaptive_compaction: bool,
/// Last reasoning_content from LLM (preserved thinking).
last_reasoning: Option<String>,
/// Compaction history (for status display).
compaction_log: Vec<CompactionEvent>,
/// Actual prompt token count from the last API response (None until first call).
last_actual_prompt_tokens: Option<usize>,
/// Number of messages present when last_actual_prompt_tokens was recorded.
/// Used to estimate only the delta for messages added after the last API call.
messages_at_last_api_call: usize,
/// Running sum of estimated tokens for all messages (incremental, O(1) per push).
/// Recomputed from scratch after compaction.
cached_msg_tokens: usize,
/// Count of tool/tool_call messages — enables O(1) tool density in adaptive_threshold().
cached_tool_count: usize,
/// Pre-built system message content (system_prompt + optional reasoning).
/// Recomputed whenever system_prompt or last_reasoning changes to avoid
/// repeated `format!()` calls in the hot path of build_messages().
cached_system_content: String,
/// Preserved implementation decisions extracted before compaction.
/// Injected into the system message by build_messages() whenever non-empty.
/// Oldest entries are dropped once the size caps applied in
/// record_decision() / preserve_decisions_from_range() are exceeded.
decision_log: Vec<String>,
}
/// Record of a compaction event.
///
/// Entries are stored in `ConversationContext::compaction_log`; the most
/// recent one feeds `compaction_quality()`. The code that creates these
/// records is not in this part of the file.
#[derive(Debug, Clone)]
pub struct CompactionEvent {
/// Number of messages before compaction.
pub before_messages: usize,
/// Number of messages after compaction.
pub after_messages: usize,
/// Estimated tokens before compaction.
pub before_tokens: usize,
/// Estimated tokens after compaction.
pub after_tokens: usize,
/// Preview of the summary generated for this compaction
/// (presumably truncated by the producer — TODO confirm).
pub summary_preview: String,
}
impl ConversationContext {
/// Create an empty context using the default budget: 200,000 context tokens
/// and a compaction threshold of 0.8.
pub fn new(system_prompt: String) -> Self {
    const DEFAULT_MAX_CONTEXT_TOKENS: usize = 200_000;
    const DEFAULT_COMPACTION_THRESHOLD: f32 = 0.8;

    Self::with_budget(
        system_prompt,
        DEFAULT_MAX_CONTEXT_TOKENS,
        DEFAULT_COMPACTION_THRESHOLD,
    )
}
/// Create an empty context with a caller-supplied token budget and
/// compaction threshold.
///
/// The context starts with no messages, no preserved reasoning, empty
/// compaction and decision logs, zeroed token/tool caches, and adaptive
/// compaction enabled. The cached system content is just the prompt,
/// because there is no reasoning to append yet.
pub fn with_budget(
    system_prompt: String,
    max_context_tokens: usize,
    compaction_threshold: f32,
) -> Self {
    Self {
        // Cloned first: `system_prompt` is moved into its own field below.
        cached_system_content: system_prompt.clone(),
        system_prompt,
        max_context_tokens,
        compaction_threshold,
        adaptive_compaction: true,
        messages: Vec::new(),
        decision_log: Vec::new(),
        compaction_log: Vec::new(),
        last_reasoning: None,
        last_actual_prompt_tokens: None,
        messages_at_last_api_call: 0,
        cached_msg_tokens: 0,
        cached_tool_count: 0,
    }
}
/// Compose the system message text from the prompt and optional reasoning.
///
/// Without reasoning the prompt is returned as an owned copy. With reasoning,
/// a "Previous Reasoning (preserved)" section wrapping the text in
/// `<reasoning>` tags is appended after the prompt.
fn make_system_content(system_prompt: &str, reasoning: Option<&str>) -> String {
    reasoning.map_or_else(
        || system_prompt.to_string(),
        |r| {
            format!(
                "{}\n\n## Previous Reasoning (preserved)\n\n<reasoning>\n{}\n</reasoning>",
                system_prompt, r
            )
        },
    )
}
/// Restore a context from a saved session snapshot using the default budget
/// (200,000 context tokens, compaction threshold 0.8).
///
/// Used by headless/ACP paths. For TUI sessions use `restore_with_budget`.
///
/// All derived state (token/tool caches, cached system content) is rebuilt
/// by `restore_with_budget`, so this is only a defaults-supplying wrapper.
pub fn restore(
    system_prompt: String,
    messages: Vec<Message>,
    last_reasoning: Option<String>,
) -> Self {
    Self::restore_with_budget(system_prompt, messages, last_reasoning, 200_000, 0.8)
}
/// Restore a context from saved messages and reasoning with a custom token
/// budget and compaction threshold.
///
/// The token and tool-message caches are recomputed from `messages`, and the
/// cached system content is rebuilt from the prompt plus `last_reasoning`.
/// The compaction log, decision log and API token baseline start empty;
/// adaptive compaction is enabled.
pub fn restore_with_budget(
    system_prompt: String,
    messages: Vec<Message>,
    last_reasoning: Option<String>,
    max_context_tokens: usize,
    compaction_threshold: f32,
) -> Self {
    // One pass over the snapshot fills both incremental caches.
    let mut cached_msg_tokens: usize = 0;
    let mut cached_tool_count: usize = 0;
    for msg in &messages {
        cached_msg_tokens += Self::estimate_message_tokens(msg);
        if msg.role == "tool" || msg.tool_calls.is_some() {
            cached_tool_count += 1;
        }
    }

    // Must be built before `system_prompt` and `last_reasoning` are moved.
    let cached_system_content =
        Self::make_system_content(&system_prompt, last_reasoning.as_deref());

    Self {
        messages,
        system_prompt,
        last_reasoning,
        max_context_tokens,
        compaction_threshold,
        cached_msg_tokens,
        cached_tool_count,
        cached_system_content,
        adaptive_compaction: true,
        last_actual_prompt_tokens: None,
        messages_at_last_api_call: 0,
        compaction_log: Vec::new(),
        decision_log: Vec::new(),
    }
}
/// Returns the configured token budget for the entire context
/// (the `max_context_tokens` value given at construction).
pub fn max_context_tokens(&self) -> usize {
self.max_context_tokens
}
/// Get token count currently in context.
/// Uses actual API-reported count when available, falls back to heuristic estimate.
///
/// Public wrapper around `actual_used_tokens()`, which holds the logic.
pub fn used_tokens(&self) -> usize {
self.actual_used_tokens()
}
/// Update the context with the actual prompt token count returned by the API.
/// Call immediately after receiving `StreamEvent::Done { prompt_tokens, .. }`.
///
/// Also records the current message count, so that `actual_used_tokens()`
/// only has to estimate messages pushed after this point.
pub fn update_actual_tokens(&mut self, prompt_tokens: u32) {
self.last_actual_prompt_tokens = Some(prompt_tokens as usize);
// Baseline marker: messages at indices >= this value are "new since the API call".
self.messages_at_last_api_call = self.messages.len();
}
/// Best available figure for tokens currently in the context.
///
/// With no API-reported baseline the heuristic estimate is returned. With a
/// baseline, the estimated tokens of every message pushed since the baseline
/// was recorded are added to it.
///
/// If the message list is now shorter than it was at the baseline (e.g.
/// compaction ran without resetting it), the baseline is considered stale
/// and the heuristic estimate is returned instead. The baseline-derived
/// value is never allowed to drop below the heuristic estimate, so large
/// in-flight additions show up before the next baseline refresh.
fn actual_used_tokens(&self) -> usize {
    let heuristic = self.estimate_total_tokens();

    // `get(n..)` yields `None` exactly when the list shrank below the
    // recorded baseline length, and an empty slice when nothing was added.
    let added_since_call = self.messages.get(self.messages_at_last_api_call..);

    match (self.last_actual_prompt_tokens, added_since_call) {
        (Some(baseline), Some(added)) => {
            let delta: usize = added.iter().map(Self::estimate_message_tokens).sum();
            (baseline + delta).max(heuristic)
        }
        _ => heuristic,
    }
}
/// Returns the base system prompt, without the preserved-reasoning or
/// decision sections that `build_messages()` adds to the system message.
pub fn system_prompt(&self) -> &str {
&self.system_prompt
}
pub fn push(&mut self, message: Message) {
if message.role == "tool" || message.tool_calls.is_some() {
self.cached_tool_count += 1;
}
self.cached_msg_tokens += Self::estimate_message_tokens(&message);
self.messages.push(message);
self.compact_if_needed();
}
/// Append a message and report whether doing so triggered compaction.
///
/// The tool-message counter and the running token estimate are updated
/// before the message is stored; `compact_if_needed()` then runs. Compaction
/// is detected by the compaction log having grown.
///
/// NOTE(review): if the compaction log is trimmed to a fixed cap elsewhere
/// (see `MAX_COMPACTION_LOG`), a compaction that happens while the log is
/// already full would not change its length and would be reported as
/// `false` — confirm against the `compaction` module.
pub fn push_and_report(&mut self, message: Message) -> bool {
    let log_len_before = self.compaction_log.len();

    let is_tool_message = message.role == "tool" || message.tool_calls.is_some();
    self.cached_tool_count += usize::from(is_tool_message);
    self.cached_msg_tokens += Self::estimate_message_tokens(&message);

    self.messages.push(message);
    self.compact_if_needed();

    log_len_before < self.compaction_log.len()
}
/// Returns the stored conversation messages in order. The system message is
/// not included; use `build_messages()` for the full list.
pub fn messages(&self) -> &[Message] {
&self.messages
}
/// Store the reasoning content from the latest LLM response and rebuild the
/// cached system content so it includes that reasoning.
pub fn set_last_reasoning(&mut self, reasoning: String) {
    self.last_reasoning = Some(reasoning);
    self.cached_system_content =
        Self::make_system_content(&self.system_prompt, self.last_reasoning.as_deref());
}
/// Number of entries currently in the compaction log.
///
/// NOTE(review): this equals the number of compactions only while the log is
/// never trimmed; if it is capped elsewhere (see `MAX_COMPACTION_LOG`) the
/// count saturates at that cap — confirm.
pub fn compaction_count(&self) -> usize {
self.compaction_log.len()
}
/// Enable or disable adaptive compaction threshold adjustment.
///
/// When disabled, `adaptive_threshold()` returns the configured
/// `compaction_threshold` unchanged.
pub fn set_adaptive_compaction(&mut self, enabled: bool) {
self.adaptive_compaction = enabled;
}
/// Get the recorded compaction history as a slice, in stored order.
pub fn compaction_log(&self) -> &[CompactionEvent] {
&self.compaction_log
}
/// Build the full message list to send: one system message followed by a
/// clone of every stored message.
///
/// The system message starts from the cached system content (prompt plus any
/// preserved reasoning). When the decision log is non-empty, an
/// "Implementation Decisions" section listing each decision as a `- ` bullet
/// is appended, so those decisions survive compaction of the messages.
pub fn build_messages(&self) -> Vec<Message> {
    let system_content = match self.decision_log.as_slice() {
        [] => self.cached_system_content.clone(),
        decisions => {
            // One "- <decision>" bullet per line, newline-separated.
            let mut bullets = String::new();
            for decision in decisions {
                if !bullets.is_empty() {
                    bullets.push('\n');
                }
                bullets.push_str("- ");
                bullets.push_str(decision);
            }
            format!(
                "{}\n\n## Implementation Decisions (preserved — do not re-decide)\n\n{}",
                self.cached_system_content, bullets
            )
        }
    };

    let system_message = Message {
        role: "system".to_string(),
        content: Some(Content::text(&system_content)),
        reasoning_content: None,
        tool_calls: None,
        tool_call_id: None,
    };

    let mut full = Vec::with_capacity(1 + self.messages.len());
    full.push(system_message);
    full.extend_from_slice(&self.messages);
    full
}
/// Record an implementation decision in the persistent decision log.
///
/// Exact duplicates are ignored. After the insert, the log is trimmed to its
/// 20 most recent entries by dropping the oldest ones. Logged decisions are
/// injected into the system message by `build_messages()`.
///
/// NOTE(review): `preserve_decisions_from_range` trims the same log to 30
/// entries, so a call here can shrink a log that method filled — confirm
/// which cap is intended.
pub fn record_decision(&mut self, decision: String) {
    const MAX_ENTRIES: usize = 20;

    if !self.decision_log.contains(&decision) {
        self.decision_log.push(decision);
    }

    let overflow = self.decision_log.len().saturating_sub(MAX_ENTRIES);
    if overflow > 0 {
        self.decision_log.drain(..overflow);
    }
}
/// Returns the implementation decision log, oldest entry first.
/// `build_messages()` injects these into the system message whenever the log
/// is non-empty.
pub fn decision_log(&self) -> &[String] {
&self.decision_log
}
/// Heuristically estimate the token count of one message.
///
/// - Text: byte length / 4 plus a fixed 10 (about 4 chars per token for
///   code), applied to a plain text body and to each text part.
/// - Image parts: 85 tokens for `detail == "low"`, 1105 for `"high"`
///   (Vision API estimate per OpenAI: low detail is 85 tokens, high detail
///   is up to 1105 tokens for a 2048x2048 image). For any other detail
///   value the image size is guessed from the URL length: under 150,000
///   bytes (~150KB base64, roughly 110KB raw, i.e. the low-detail range)
///   counts as 85, anything larger as 512.
/// - Tool calls: 20 tokens per entry in `tool_calls`, added on top.
///
/// A message without content contributes only its tool-call tokens.
fn estimate_message_tokens(msg: &Message) -> usize {
    let content_tokens: usize = match &msg.content {
        None => 0,
        Some(Content::Text(body)) => body.len() / 4 + 10,
        Some(Content::Parts(parts)) => parts
            .iter()
            .map(|part| match part {
                crate::api::ContentPart::Text { text } => text.len() / 4 + 10,
                crate::api::ContentPart::ImageUrl { image_url } => {
                    match image_url.detail.as_deref() {
                        Some("low") => 85,
                        Some("high") => 1105,
                        // Unknown detail: estimate from the base64 URL length.
                        _ if image_url.url.len() < 150_000 => 85,
                        _ => 512,
                    }
                }
            })
            .sum(),
    };

    // Tool calls carry extra payload beyond the message content.
    let tool_call_tokens = msg
        .tool_calls
        .as_ref()
        .map_or(0, |calls| calls.len() * 20);

    content_tokens + tool_call_tokens
}
/// Estimate total tokens in the conversation.
///
/// Uses the incrementally-maintained `cached_msg_tokens` (updated on every
/// push) instead of re-summing the full message list each call.
///
/// NOTE(review): the system share is computed from `system_prompt` alone.
/// The preserved-reasoning and decision-log text that `build_messages()`
/// adds to the system message is not counted, so the estimate can run low
/// when those are large — confirm whether that is intended.
pub fn estimate_total_tokens(&self) -> usize {
// Same ~4 bytes per token + 10 heuristic as estimate_message_tokens().
let system_tokens = self.system_prompt.len() / 4 + 10;
system_tokens + self.cached_msg_tokens
}
/// Compute the compaction threshold, adapted to the conversation's shape.
///
/// When adaptive compaction is disabled, or there are fewer than 6 messages,
/// the configured `compaction_threshold` is returned unchanged. Otherwise
/// two adjustments are added to it and the result is clamped to 0.40–0.90:
///
/// - Tool density (fraction of tool call/result messages):
///   above 60% → -0.13, above 35% → -0.07, above 15% → 0.0, else +0.05.
/// - Average estimated tokens per message:
///   above 500 → -0.05, above 200 → -0.02, else 0.0.
///
/// Tool-heavy or large-turn conversations therefore compact earlier, while
/// mostly conversational ones may use more of the window. With the default
/// base of 0.8 the result lies between 0.62 and 0.85.
fn adaptive_threshold(&self) -> f32 {
    let message_count = self.messages.len();
    if !self.adaptive_compaction || message_count < 6 {
        // Adaptive disabled or too little history: use the configured value strictly.
        return self.compaction_threshold;
    }

    // `message_count` is at least 6 here, so both divisions are safe.
    let total = message_count as f32;
    let tool_density = self.cached_tool_count as f32 / total;
    let avg_msg_tokens = self.cached_msg_tokens as f32 / total;

    // Primary factor: how tool-heavy the conversation is.
    let density_adj: f32 = match tool_density {
        d if d > 0.6 => -0.13,
        d if d > 0.35 => -0.07,
        d if d > 0.15 => 0.0,
        _ => 0.05,
    };

    // Secondary factor: large turns (file reads, long tool output) push it down.
    let size_adj: f32 = match avg_msg_tokens {
        t if t > 500.0 => -0.05,
        t if t > 200.0 => -0.02,
        _ => 0.0,
    };

    (self.compaction_threshold + density_adj + size_adj).clamp(0.40, 0.90)
}
/// Fraction of the context budget that compaction should compress down to.
///
/// Chosen from tool density (tool call/result messages over all messages; an
/// empty conversation counts as density 0):
/// above 60% → 0.20, above 35% → 0.25, otherwise 0.30. Tool-heavy
/// conversations get the more aggressive target because tool output
/// dominates their token usage.
fn adaptive_target_ratio(&self) -> f64 {
    let tool_density = match self.messages.len() {
        0 => 0.0,
        total => self.cached_tool_count as f64 / total as f64,
    };

    match tool_density {
        d if d > 0.6 => 0.20,
        d if d > 0.35 => 0.25,
        _ => 0.30,
    }
}
// =====================================================================
// Multi-Pass Compaction Pipeline
//
// Based on research from Microsoft Agent Framework, JetBrains, OpenCode:
// Pass 0: Observation masking (replace old tool outputs with placeholders)
// Pass 1: SimHash dedup (collapse near-identical tool results)
// Pass 2: Smart relevance-aware compaction (IDF scoring, structured summary)
// Pass 3: Fallback fixed-window truncation
//
// Each pass is progressively more aggressive. Pipeline stops early when
// the target token budget is reached.
// =====================================================================
/// Harvest decision-like sentences from `messages[start..end]` into the
/// decision log, so they remain in the system message after those messages
/// are compacted away.
///
/// `end` is clamped to the message count; an empty range does nothing. Only
/// assistant messages with content are scanned, and only their first 40
/// lines. A trimmed line qualifies when it is 25–400 bytes long and either
/// starts with one of a fixed set of decision prefixes or is longer than 50
/// bytes and contains " because ". Qualifying lines are cut to 300 bytes
/// with `truncate_bytes` and skipped if already present in the log or in
/// this batch. Finally the log is trimmed to its 30 most recent entries.
fn preserve_decisions_from_range(&mut self, start: usize, end: usize) {
    const DECISION_PREFIXES: [&str; 6] = [
        "I'll ",
        "I will ",
        "The fix ",
        "The approach ",
        "Decided to ",
        "The solution ",
    ];
    const MAX_ENTRIES: usize = 30;

    let end = end.min(self.messages.len());
    if start >= end {
        return;
    }

    let mut harvested: Vec<String> = Vec::new();
    for msg in self.messages[start..end]
        .iter()
        .filter(|m| m.role == "assistant")
    {
        let text = match msg.content.as_ref() {
            Some(content) => content.text_content(),
            None => continue,
        };

        for candidate in text.lines().take(40).map(str::trim) {
            if !(25..=400).contains(&candidate.len()) {
                continue;
            }

            let has_prefix = DECISION_PREFIXES
                .iter()
                .any(|prefix| candidate.starts_with(*prefix));
            let gives_reason = candidate.len() > 50 && candidate.contains(" because ");
            if !(has_prefix || gives_reason) {
                continue;
            }

            let entry = crate::util::truncate_bytes(candidate, 300).to_string();
            let already_known = self.decision_log.contains(&entry) || harvested.contains(&entry);
            if !already_known {
                harvested.push(entry);
            }
        }
    }

    self.decision_log.extend(harvested);

    // Keep only the newest entries.
    let overflow = self.decision_log.len().saturating_sub(MAX_ENTRIES);
    if overflow > 0 {
        self.decision_log.drain(..overflow);
    }
}
pub fn compaction_quality(&self) -> Option<crate::bench::CompactionQuality> {
let last = self.compaction_log.last()?;
let token_reduction = if last.before_tokens > 0 {
last.after_tokens as f32 / last.before_tokens as f32
} else {
1.0
};
// Count file paths, decisions, errors in current messages
let mut files_in_context = 0u32;
let mut decisions_in_context = 0u32;
let mut errors_in_context = 0u32;
for msg in &self.messages {
if let Some(ref content) = msg.content {
let text = content.text_content();
files_in_context += text
.lines()
.filter(|l| l.contains('/') && l.contains('.') && !l.starts_with("http"))
.count() as u32;
decisions_in_context += text
.lines()
.filter(|l| {
let t = l.trim();
t.starts_with("I'll ")
|| t.starts_with("Let me ")
|| t.starts_with("I need to ")
|| t.starts_with("Files Modified")
|| t.starts_with("Key Decisions")
})
.count() as u32;
if text.starts_with("Error:") || text.starts_with("error:") {
errors_in_context += 1;
}
if text.contains("Errors Encountered") {
errors_in_context += 1;
}
}
}
let msg_ratio = if last.before_messages > 0 {
last.before_messages as f32
} else {
1.0
};
Some(crate::bench::CompactionQuality {
preserved_files: (files_in_context as f32 / msg_ratio).min(1.0),
preserved_decisions: (decisions_in_context as f32 / (msg_ratio * 0.3)).min(1.0),
preserved_errors: (errors_in_context as f32 / (msg_ratio * 0.1).max(1.0)).min(1.0),
token_reduction,
})
}
/// Remaining token budget estimate: `max_context_tokens` minus the heuristic
/// total from `estimate_total_tokens()`, saturating at 0 when over budget.
///
/// This is based on the heuristic estimate, not the API-reported count
/// that `used_tokens()` prefers.
pub fn budget_remaining(&self) -> usize {
self.max_context_tokens
.saturating_sub(self.estimate_total_tokens())
}
/// Preemptively run threshold-based compaction without pushing a new message.
/// Returns `true` if compaction actually ran (i.e., a new entry was appended
/// to the compaction log). Safe to call before building an API request —
/// the decision whether to act is made inside `compact_if_needed()`.
///
/// NOTE(review): detection compares compaction-log lengths; if the log is
/// trimmed to a fixed cap elsewhere (see `MAX_COMPACTION_LOG`), a compaction
/// that happens while the log is full would return `false` — confirm.
pub fn maybe_compact(&mut self) -> bool {
let before = self.compaction_log.len();
self.compact_if_needed();
self.compaction_log.len() > before
}
/// Replace the system prompt (e.g., after a repo map rebuild) and rebuild the
/// cached system content from the new prompt plus any preserved reasoning.
pub fn update_system_prompt(&mut self, new_prompt: String) {
    // Build the cache from the incoming prompt first, then take ownership of it.
    self.cached_system_content =
        Self::make_system_content(&new_prompt, self.last_reasoning.as_deref());
    self.system_prompt = new_prompt;
}
/// Append content (e.g., skill metadata) to the system prompt, separated from
/// the existing text by a blank line, then rebuild the cached system content.
pub fn append_system_prompt(&mut self, content: &str) {
    self.system_prompt += "\n\n";
    self.system_prompt += content;

    let refreshed =
        Self::make_system_content(&self.system_prompt, self.last_reasoning.as_deref());
    self.cached_system_content = refreshed;
}
/// Drop all messages and reset token tracking, keeping the system prompt.
///
/// The token and tool caches, the API-reported baseline and the preserved
/// reasoning are reset, and the cached system content goes back to the bare
/// prompt. The decision log and compaction log are left untouched.
pub fn clear(&mut self) {
    self.messages.clear();

    // Derived counters describe an empty conversation again.
    self.cached_tool_count = 0;
    self.cached_msg_tokens = 0;

    // Forget the API baseline; it referred to the dropped messages.
    self.messages_at_last_api_call = 0;
    self.last_actual_prompt_tokens = None;

    // With no reasoning left, the system content is just the prompt.
    self.last_reasoning = None;
    self.cached_system_content
        .clone_from(&self.system_prompt);
}
/// Force compaction regardless of the threshold.
///
/// Runs `run_compaction_pipeline` directly, bypassing the threshold check.
/// The token target is `max_context_tokens` multiplied by
/// `adaptive_target_ratio()` (20–30% depending on tool density). Does nothing
/// when two or fewer messages are stored.
pub fn force_compact(&mut self) {
    let message_count = self.messages.len();
    if message_count > 2 {
        tracing::info!(
            "Force compaction requested: ~{} messages",
            message_count
        );

        let budget = self.max_context_tokens as f64;
        let target_tokens = (budget * self.adaptive_target_ratio()) as usize;
        self.run_compaction_pipeline(target_tokens);
    }
}
}