Skip to main content

codetether_agent/session/helper/
compression.rs

1//! Session history compression via the RLM router.
2//!
3//! This module contains the context-window enforcement logic that keeps
4//! the prompt under the model's token budget. It is invoked automatically
5//! at the start of every agent step by [`Session::run_loop`](crate::session::Session).
6//!
7//! ## Strategy
8//!
9//! 1. Estimate the current request token cost (system + messages + tools).
10//! 2. If it exceeds 90% of the model's usable budget, compress the prefix
11//!    of the conversation via [`RlmRouter::auto_process`], keeping the
12//!    most recent `keep_last` messages verbatim.
13//! 3. Progressively shrink `keep_last` (16 → 12 → 8 → 6) until the budget
14//!    is met or nothing more can be compressed.
15//!
16//! The compressed prefix is replaced by a single synthetic assistant
17//! message tagged `[AUTO CONTEXT COMPRESSION]` so the model sees a
18//! coherent summary rather than a truncated tail.
19
20use std::sync::Arc;
21
22use anyhow::Result;
23use chrono::Utc;
24
25use crate::provider::{ContentPart, Message, Role, ToolDefinition};
26use crate::rlm::router::AutoProcessContext;
27use crate::rlm::{RlmChunker, RlmConfig, RlmRouter};
28
29use super::error::messages_to_rlm_context;
30use super::token::{
31    context_window_for_model, estimate_request_tokens, session_completion_max_tokens,
32};
33use crate::session::Session;
34
/// Successive `keep_last` sizes handed to [`compress_history_keep_last`] by
/// [`enforce_context_window`], ordered from least to most aggressive.
const KEEP_LAST_CANDIDATES: [usize; 4] = [16, 12, 8, 6];

/// Share of the usable budget (context window minus reserve) that an
/// estimated request may occupy; anything above it triggers compression.
const SAFETY_BUDGET_RATIO: f64 = 0.9;

/// Extra tokens reserved on top of `session_completion_max_tokens()` to
/// cover tool schemas, protocol framing, and provider-specific wrappers.
const RESERVE_OVERHEAD_TOKENS: usize = 2 * 1024;

/// Target size for the fallback chunk compression, expressed as a fraction
/// of the model's context window.
const FALLBACK_CHUNK_RATIO: f64 = 1.0 / 4.0;
47
48/// Compress all messages older than the last `keep_last` into a single
49/// synthetic `[AUTO CONTEXT COMPRESSION]` assistant message.
50///
51/// Returns `Ok(true)` if compression ran, `Ok(false)` if the session was
52/// already short enough to skip.
53pub(crate) async fn compress_history_keep_last(
54    session: &mut Session,
55    provider: Arc<dyn crate::provider::Provider>,
56    model: &str,
57    keep_last: usize,
58    reason: &str,
59) -> Result<bool> {
60    if session.messages.len() <= keep_last {
61        return Ok(false);
62    }
63
64    let split_idx = session.messages.len().saturating_sub(keep_last);
65    let tail = session.messages.split_off(split_idx);
66    let prefix = std::mem::take(&mut session.messages);
67
68    let context = messages_to_rlm_context(&prefix);
69    let ctx_window = context_window_for_model(model);
70
71    let rlm_config = session.metadata.rlm.clone();
72    let auto_ctx = AutoProcessContext {
73        tool_id: "session_context",
74        tool_args: serde_json::json!({"reason": reason}),
75        session_id: &session.id,
76        abort: None,
77        on_progress: None,
78        provider,
79        model: model.to_string(),
80        bus: None,
81        trace_id: None,
82        subcall_provider: session.metadata.subcall_provider.clone(),
83        subcall_model: session.metadata.subcall_model_name.clone(),
84    };
85
86    let summary = match RlmRouter::auto_process(&context, auto_ctx, &rlm_config).await {
87        Ok(result) => {
88            tracing::info!(
89                reason,
90                input_tokens = result.stats.input_tokens,
91                output_tokens = result.stats.output_tokens,
92                compression_ratio = result.stats.compression_ratio,
93                "RLM: Compressed session history"
94            );
95            result.processed
96        }
97        Err(e) => {
98            tracing::warn!(
99                reason,
100                error = %e,
101                "RLM: Failed to compress session history; falling back to chunk compression"
102            );
103            RlmChunker::compress(
104                &context,
105                (ctx_window as f64 * FALLBACK_CHUNK_RATIO) as usize,
106                None,
107            )
108        }
109    };
110
111    let summary_msg = Message {
112        role: Role::Assistant,
113        content: vec![ContentPart::Text {
114            text: format!(
115                "[AUTO CONTEXT COMPRESSION]\nOlder conversation + tool output was compressed \
116                 to fit the model context window.\n\n{summary}"
117            ),
118        }],
119    };
120
121    let mut new_messages = Vec::with_capacity(1 + tail.len());
122    new_messages.push(summary_msg);
123    new_messages.extend(tail);
124    session.messages = new_messages;
125    session.updated_at = Utc::now();
126
127    Ok(true)
128}
129
130/// Ensure the estimated request token cost fits within the model's safety budget.
131///
132/// Invokes [`compress_history_keep_last`] with progressively smaller
133/// `keep_last` values until the estimate is under budget or nothing more
134/// can be compressed.
135pub(crate) async fn enforce_context_window(
136    session: &mut Session,
137    provider: Arc<dyn crate::provider::Provider>,
138    model: &str,
139    system_prompt: &str,
140    tools: &[ToolDefinition],
141) -> Result<()> {
142    let ctx_window = context_window_for_model(model);
143    let reserve = session_completion_max_tokens().saturating_add(RESERVE_OVERHEAD_TOKENS);
144    let budget = ctx_window.saturating_sub(reserve);
145    let safety_budget = (budget as f64 * SAFETY_BUDGET_RATIO) as usize;
146
147    for keep_last in KEEP_LAST_CANDIDATES {
148        let est = estimate_request_tokens(system_prompt, &session.messages, tools);
149        if est <= safety_budget {
150            return Ok(());
151        }
152
153        tracing::info!(
154            est_tokens = est,
155            ctx_window,
156            safety_budget,
157            keep_last,
158            "Context window approaching limit; compressing older session history"
159        );
160
161        let did = compress_history_keep_last(
162            session,
163            Arc::clone(&provider),
164            model,
165            keep_last,
166            "context_budget",
167        )
168        .await?;
169
170        if !did {
171            break;
172        }
173    }
174
175    Ok(())
176}