// codetether_agent/session/helper/compression.rs
use std::sync::Arc;

use anyhow::Result;
use chrono::Utc;

use crate::provider::{ContentPart, Message, Role, ToolDefinition};
use crate::rlm::router::AutoProcessContext;
use crate::rlm::{RlmChunker, RlmConfig, RlmRouter};

use super::error::messages_to_rlm_context;
use super::token::{
    context_window_for_model, estimate_request_tokens, session_completion_max_tokens,
};
use crate::session::Session;
/// Tail sizes tried in order by `enforce_context_window`: keep the newest N
/// messages verbatim and compress everything older. Progressively smaller
/// tails are attempted while the token estimate still exceeds the budget.
const KEEP_LAST_CANDIDATES: [usize; 4] = [16, 12, 8, 6];

/// Fraction of the usable token budget treated as the safe ceiling; requests
/// estimated above `budget * SAFETY_BUDGET_RATIO` trigger compression.
const SAFETY_BUDGET_RATIO: f64 = 0.90;

/// Fixed token headroom reserved on top of the completion allowance
/// (presumably covers prompt framing/serialization overhead — estimate, not exact).
const RESERVE_OVERHEAD_TOKENS: usize = 2048;

/// When the RLM router fails, the chunker fallback targets this fraction of
/// the model's context window as the compressed output size.
const FALLBACK_CHUNK_RATIO: f64 = 0.25;
48pub(crate) async fn compress_history_keep_last(
54 session: &mut Session,
55 provider: Arc<dyn crate::provider::Provider>,
56 model: &str,
57 keep_last: usize,
58 reason: &str,
59) -> Result<bool> {
60 if session.messages.len() <= keep_last {
61 return Ok(false);
62 }
63
64 let split_idx = session.messages.len().saturating_sub(keep_last);
65 let tail = session.messages.split_off(split_idx);
66 let prefix = std::mem::take(&mut session.messages);
67
68 let context = messages_to_rlm_context(&prefix);
69 let ctx_window = context_window_for_model(model);
70
71 let rlm_config = session.metadata.rlm.clone();
72 let auto_ctx = AutoProcessContext {
73 tool_id: "session_context",
74 tool_args: serde_json::json!({"reason": reason}),
75 session_id: &session.id,
76 abort: None,
77 on_progress: None,
78 provider,
79 model: model.to_string(),
80 bus: None,
81 trace_id: None,
82 subcall_provider: session.metadata.subcall_provider.clone(),
83 subcall_model: session.metadata.subcall_model_name.clone(),
84 };
85
86 let summary = match RlmRouter::auto_process(&context, auto_ctx, &rlm_config).await {
87 Ok(result) => {
88 tracing::info!(
89 reason,
90 input_tokens = result.stats.input_tokens,
91 output_tokens = result.stats.output_tokens,
92 compression_ratio = result.stats.compression_ratio,
93 "RLM: Compressed session history"
94 );
95 result.processed
96 }
97 Err(e) => {
98 tracing::warn!(
99 reason,
100 error = %e,
101 "RLM: Failed to compress session history; falling back to chunk compression"
102 );
103 RlmChunker::compress(
104 &context,
105 (ctx_window as f64 * FALLBACK_CHUNK_RATIO) as usize,
106 None,
107 )
108 }
109 };
110
111 let summary_msg = Message {
112 role: Role::Assistant,
113 content: vec![ContentPart::Text {
114 text: format!(
115 "[AUTO CONTEXT COMPRESSION]\nOlder conversation + tool output was compressed \
116 to fit the model context window.\n\n{summary}"
117 ),
118 }],
119 };
120
121 let mut new_messages = Vec::with_capacity(1 + tail.len());
122 new_messages.push(summary_msg);
123 new_messages.extend(tail);
124 session.messages = new_messages;
125 session.updated_at = Utc::now();
126
127 Ok(true)
128}
130pub(crate) async fn enforce_context_window(
136 session: &mut Session,
137 provider: Arc<dyn crate::provider::Provider>,
138 model: &str,
139 system_prompt: &str,
140 tools: &[ToolDefinition],
141) -> Result<()> {
142 let ctx_window = context_window_for_model(model);
143 let reserve = session_completion_max_tokens().saturating_add(RESERVE_OVERHEAD_TOKENS);
144 let budget = ctx_window.saturating_sub(reserve);
145 let safety_budget = (budget as f64 * SAFETY_BUDGET_RATIO) as usize;
146
147 for keep_last in KEEP_LAST_CANDIDATES {
148 let est = estimate_request_tokens(system_prompt, &session.messages, tools);
149 if est <= safety_budget {
150 return Ok(());
151 }
152
153 tracing::info!(
154 est_tokens = est,
155 ctx_window,
156 safety_budget,
157 keep_last,
158 "Context window approaching limit; compressing older session history"
159 );
160
161 let did = compress_history_keep_last(
162 session,
163 Arc::clone(&provider),
164 model,
165 keep_last,
166 "context_budget",
167 )
168 .await?;
169
170 if !did {
171 break;
172 }
173 }
174
175 Ok(())
176}