// codetether_agent/session/helper/experimental/thinking_prune.rs
1//! Strip extended-thinking blocks from older messages.
2//!
3//! Modern reasoning models (Claude extended thinking, DeepSeek R1,
4//! GPT-5 reasoning, Gemini thought summaries) emit `Thinking` content
5//! parts that can be **10-100×** larger than the assistant's actual
6//! reply. These blocks help the *current* turn's decision but carry
7//! almost no value once the turn has produced its tool calls and the
8//! loop has moved on — the final answer/action already reflects them.
9//!
10//! This module removes [`ContentPart::Thinking`] from every message
11//! older than [`KEEP_LAST_MESSAGES`]. Recent thinking is preserved so
12//! the model can still reference its own recent chain-of-thought.
13//!
14//! # Safety
15//!
16//! * Providers that inject thinking for correctness (cache-coherent
17//! thought signatures on Gemini `ToolCall`) are unaffected — those
18//! signatures live on `ContentPart::ToolCall::thought_signature`,
19//! not on `Thinking` blocks.
20//! * An assistant message whose *only* content was a thinking block
21//! becomes empty; such messages are removed entirely to keep the
22//! buffer a valid provider-consumable shape.
23//!
24//! # Always-on
25//!
26//! No config. Thinking blocks are known to be non-essential after the
27//! turn completes; stripping them is the single highest-ROI shrink for
28//! reasoning-heavy agent loops.
29
30use super::ExperimentalStats;
31use crate::provider::{ContentPart, Message, Role};
32
/// Keep thinking blocks in this many trailing messages.
///
/// Messages older than this window have their `Thinking` parts
/// stripped by [`prune_thinking`]; the last `KEEP_LAST_MESSAGES`
/// messages are left untouched so the model can still reference its
/// own recent chain-of-thought.
pub const KEEP_LAST_MESSAGES: usize = 4;
35
36/// Strip `Thinking` parts from older messages and drop any messages
37/// that become empty as a result.
38///
39/// # Examples
40///
41/// ```rust
42/// use codetether_agent::provider::{ContentPart, Message, Role};
43/// use codetether_agent::session::helper::experimental::thinking_prune::{
44/// prune_thinking, KEEP_LAST_MESSAGES,
45/// };
46///
47/// let thinking = ContentPart::Thinking {
48/// text: "Let me consider every option...".repeat(100),
49/// };
50/// let reply = ContentPart::Text { text: "Here is my answer.".into() };
51///
52/// let mut msgs = vec![
53/// Message { role: Role::Assistant, content: vec![thinking.clone(), reply.clone()] },
54/// Message { role: Role::Assistant, content: vec![thinking.clone()] }, // thinking-only
55/// ];
56/// for i in 0..KEEP_LAST_MESSAGES + 1 {
57/// msgs.push(Message {
58/// role: Role::User,
59/// content: vec![ContentPart::Text { text: format!("q{i}") }],
60/// });
61/// }
62///
63/// let stats = prune_thinking(&mut msgs);
64/// assert!(stats.total_bytes_saved > 1000);
65/// // Thinking-only message was removed entirely.
66/// // Remaining first message retains the Text reply.
67/// assert!(matches!(&msgs[0].content[..], [ContentPart::Text { .. }]));
68/// ```
69pub fn prune_thinking(messages: &mut Vec<Message>) -> ExperimentalStats {
70 let mut stats = ExperimentalStats::default();
71 let total = messages.len();
72 if total <= KEEP_LAST_MESSAGES {
73 return stats;
74 }
75 let eligible = total - KEEP_LAST_MESSAGES;
76
77 // Prune parts in-place.
78 for msg in messages[..eligible].iter_mut() {
79 if msg.role != Role::Assistant {
80 continue;
81 }
82 let before: usize = msg.content.iter().map(thinking_bytes).sum();
83 msg.content
84 .retain(|p| !matches!(p, ContentPart::Thinking { .. }));
85 let after: usize = msg.content.iter().map(thinking_bytes).sum();
86 let saved = before.saturating_sub(after);
87 if saved > 0 {
88 stats.total_bytes_saved += saved;
89 stats.snippet_hits += 1;
90 }
91 }
92
93 // Drop assistant messages that became empty. Indices in [0, eligible).
94 let mut write = 0;
95 for read in 0..messages.len() {
96 let drop = read < eligible
97 && messages[read].role == Role::Assistant
98 && messages[read].content.is_empty();
99 if !drop {
100 if write != read {
101 messages.swap(write, read);
102 }
103 write += 1;
104 }
105 }
106 messages.truncate(write);
107
108 stats
109}
110
111fn thinking_bytes(p: &ContentPart) -> usize {
112 match p {
113 ContentPart::Thinking { text } => text.len(),
114 _ => 0,
115 }
116}
117
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a single-part `Text` message for padding out histories.
    fn text_msg(role: Role, text: String) -> Message {
        Message {
            role,
            content: vec![ContentPart::Text { text }],
        }
    }

    #[test]
    fn recent_thinking_preserved() {
        // A lone message sits inside the keep-window: nothing is pruned.
        let mut history = vec![Message {
            role: Role::Assistant,
            content: vec![ContentPart::Thinking {
                text: "x".repeat(5000),
            }],
        }];
        let stats = prune_thinking(&mut history);
        assert_eq!(stats.total_bytes_saved, 0);
    }

    #[test]
    fn empty_assistant_dropped() {
        // A thinking-only assistant message, pushed outside the
        // keep-window by KEEP_LAST_MESSAGES + 1 user turns, is removed.
        let mut history = vec![Message {
            role: Role::Assistant,
            content: vec![ContentPart::Thinking {
                text: "x".repeat(1000),
            }],
        }];
        history.extend((0..=KEEP_LAST_MESSAGES).map(|i| text_msg(Role::User, format!("q{i}"))));
        let len_before = history.len();
        prune_thinking(&mut history);
        assert_eq!(history.len(), len_before - 1);
    }

    #[test]
    fn user_thinking_untouched() {
        // Users can't really emit thinking, but defense-in-depth.
        let mut history = vec![Message {
            role: Role::User,
            content: vec![ContentPart::Thinking {
                text: "user-thought".repeat(100),
            }],
        }];
        history
            .extend((0..=KEEP_LAST_MESSAGES).map(|i| text_msg(Role::Assistant, format!("r{i}"))));
        let stats = prune_thinking(&mut history);
        assert_eq!(stats.total_bytes_saved, 0);
    }
}
175}