// lean_ctx/proxy/compress.rs

1use crate::core::tokens::count_tokens;
2use crate::core::web::distill;
3
/// Character budget passed to `distill::squeeze_prose` for the research-prose
/// squeeze (~6k tokens). Only oversized prose is truncated; the squeeze's main
/// job is dedup + blank-collapse, not cutting.
const RESEARCH_PROSE_CAP: usize = 24_000;
7
8/// Proxy compression funnel: routes a tool result to the right compressor.
9///
10/// 1. Already-cited research output (from `ctx_url_read` / the web layer) is kept
11///    verbatim — it is distilled and citation-stamped, so the shell pipeline must
12///    not touch its footer or claim markers.
13/// 2. Prose results (web fetches, doc reads, research MCP bridges) are squeezed
14///    by the prose-aware research compressor instead of the log/code-tuned shell
15///    engine.
16/// 3. Everything else (shell/build/search output) flows through the unified
17///    `compress_if_beneficial` pipeline. A `$ ...` command hint is extracted so
18///    the pattern engine gets the same routing as the CLI and MCP paths.
19pub fn compress_tool_result(content: &str, tool_name: Option<&str>) -> String {
20    if content.trim().is_empty() || content.len() < 200 {
21        return content.to_string();
22    }
23
24    if is_cited_research_output(content) {
25        return content.to_string();
26    }
27
28    if extract_command_hint(content).is_none() && looks_like_prose(content) {
29        if let Some(out) = squeeze_research_prose(content) {
30            return out;
31        }
32    }
33
34    let cmd = infer_command(content, tool_name);
35    crate::shell::compress::engine::compress_if_beneficial(&cmd, content)
36}
37
/// Detects a lean-ctx web read: a distilled body followed by a citation footer
/// of the form `Source: …\nSite: … · Retrieved: …`.
///
/// Both footer markers must occur somewhere in `content`; the `Source:` marker
/// must be preceded by a newline. Such output is re-compression-hostile, so
/// callers keep it verbatim.
fn is_cited_research_output(content: &str) -> bool {
    const FOOTER_MARKERS: [&str; 2] = ["· Retrieved: ", "\nSource: "];
    FOOTER_MARKERS
        .iter()
        .all(|marker| content.contains(*marker))
}
43
/// Punctuation that is dense in source code and shell output but sparse in prose.
const CODE_SYMBOLS: &str = "{}<>;=|\\$`";

/// Conservative prose detector over the first 4000 characters of `content`.
///
/// Returns `true` only when all of the following hold for that window:
/// - it has at least 600 characters;
/// - at least 60% of its characters are alphabetic, at least 12% are ASCII
///   spaces, and at most 6% appear in [`CODE_SYMBOLS`];
/// - it contains at least four of `.`, `!`, `?`;
/// - its non-blank lines average 40 or more characters.
///
/// Code, logs, tables and JSON are expected to fail at least one of these.
fn looks_like_prose(content: &str) -> bool {
    // Borrow the leading window; `char_indices` keeps the cut on a character
    // boundary even for multi-byte text.
    let window = match content.char_indices().nth(4000) {
        Some((cut, _)) => &content[..cut],
        None => content,
    };

    // One pass gathers every per-character statistic.
    let mut chars = 0usize;
    let mut letters = 0usize;
    let mut blanks = 0usize;
    let mut code_marks = 0usize;
    let mut sentence_ends = 0usize;
    for ch in window.chars() {
        chars += 1;
        if ch.is_alphabetic() {
            letters += 1;
        }
        if ch == ' ' {
            blanks += 1;
        }
        if CODE_SYMBOLS.contains(ch) {
            code_marks += 1;
        }
        if matches!(ch, '.' | '!' | '?') {
            sentence_ends += 1;
        }
    }

    // The size floor also guarantees a non-zero divisor below.
    if chars < 600 {
        return false;
    }

    let share = |count: usize| count as f32 / chars as f32;
    let letter_dense = share(letters) >= 0.6;
    let well_spaced = share(blanks) >= 0.12;
    let few_symbols = share(code_marks) <= 0.06;
    if !(letter_dense && well_spaced && few_symbols) || sentence_ends < 4 {
        return false;
    }

    // Average length of the non-blank lines, measured in characters.
    let (line_count, line_chars) = window
        .lines()
        .filter(|line| !line.trim().is_empty())
        .fold((0usize, 0usize), |(n, total), line| {
            (n + 1, total + line.chars().count())
        });

    line_count > 0 && line_chars as f32 / line_count as f32 >= 40.0
}
75
76/// Apply the prose squeeze, returning a footer-stamped result only when it
77/// actually saves tokens; otherwise `None` so the normal pipeline can try.
78fn squeeze_research_prose(content: &str) -> Option<String> {
79    let before = count_tokens(content);
80    let squeezed = distill::squeeze_prose(content, RESEARCH_PROSE_CAP);
81    if squeezed.trim().is_empty() {
82        return None;
83    }
84    let after = count_tokens(&squeezed);
85    if after + 2 >= before {
86        return None;
87    }
88    Some(crate::core::protocol::append_savings_with_info(
89        &squeezed,
90        before,
91        after,
92        Some("research"),
93        None,
94    ))
95}
96
97fn infer_command(content: &str, tool_name: Option<&str>) -> String {
98    if let Some(cmd) = extract_command_hint(content) {
99        return cmd;
100    }
101
102    if let Some(name) = tool_name {
103        let nl = name.to_lowercase();
104        if nl.contains("bash") || nl.contains("shell") || nl.contains("terminal") {
105            return "shell".to_string();
106        }
107        if nl.contains("search") || nl.contains("grep") || nl.contains("find") {
108            return "grep".to_string();
109        }
110    }
111
112    String::new()
113}
114
/// Extracts a shell command from a prompt-style line near the top of `content`.
///
/// Only the first three lines are inspected. A line counts as a prompt when,
/// after trimming surrounding whitespace, it starts with `"$ "` or `"% "`; the
/// text after the prompt is returned. Returns `None` when no such line exists.
///
/// Any extra whitespace between the prompt and the command is dropped, so
/// `"$   cargo build"` yields `"cargo build"` rather than a command with
/// leading spaces that downstream command matching would have to cope with.
fn extract_command_hint(content: &str) -> Option<String> {
    content.lines().take(3).find_map(|line| {
        let line = line.trim();
        line.strip_prefix("$ ")
            .or_else(|| line.strip_prefix("% "))
            // `line` is already right-trimmed, so the remainder is never blank.
            .map(|cmd| cmd.trim_start().to_string())
    })
}
127
#[cfg(test)]
mod tests {
    use super::*;

    /// Content under the 200-byte floor is passed through untouched.
    #[test]
    fn short_content_unchanged() {
        let input = "hello world";
        let output = compress_tool_result(input, None);
        assert_eq!(output, input);
    }

    /// Empty and whitespace-only content is passed through untouched.
    #[test]
    fn empty_content_unchanged() {
        for blank in ["", "   "] {
            assert_eq!(compress_tool_result(blank, None), blank);
        }
    }

    /// A `$ ` prompt on the first line yields the command; plain text yields none.
    #[test]
    fn command_hint_extraction() {
        let hinted = extract_command_hint("$ cargo build\nCompiling foo");
        assert_eq!(hinted, Some("cargo build".to_string()));

        let unhinted = extract_command_hint("no prefix here");
        assert_eq!(unhinted, None);
    }

    /// Without a prompt hint the tool name decides the routing command.
    #[test]
    fn tool_name_inference() {
        let cases = [
            ("bash_execute", "shell"),
            ("search_files", "grep"),
            ("unknown_tool", ""),
        ];
        for (tool, expected) in cases {
            assert_eq!(infer_command("some text", Some(tool)), expected);
        }
    }

    /// Output carrying the citation footer must come back byte-for-byte.
    #[test]
    fn cited_research_output_is_preserved_verbatim() {
        let trailing_body = "Extra body line that would otherwise be touched. ".repeat(20);
        let cited = [
            "Rust is a language.",
            "",
            "---",
            "Source: Rust — https://x.com/a",
            "Site: x.com · Retrieved: 2026-06-06T00:00:00Z",
            trailing_body.as_str(),
        ]
        .join("\n");

        let output = compress_tool_result(&cited, Some("ctx_url_read"));
        assert_eq!(output, cited);
    }

    /// Eight copies of one paragraph collapse to a single copy.
    #[test]
    fn prose_is_squeezed_and_deduped() {
        let para = "Rust is a multi-paradigm systems programming language that \
                    emphasizes performance, type safety, and fearless concurrency, \
                    achieving memory safety without a garbage collector at runtime.";
        // Repeated paragraph (well over the 600-char prose floor) → dedup keeps one.
        let mut input = vec![para; 8].join("\n\n");
        input.push('\n');
        assert!(input.len() > 600);

        let out = compress_tool_result(&input, Some("web_fetch"));
        let occurrences = out.matches("fearless concurrency").count();
        assert_eq!(occurrences, 1);
        assert!(out.contains("performance, type safety"));
    }

    /// Symbol-heavy source code must not pass the prose detector.
    #[test]
    fn code_output_is_not_treated_as_prose() {
        let snippet = "fn main() {\n    let x = vec![1, 2, 3];\n    \
                       for i in &x { println!(\"{}\", i); }\n}\n";
        let code = snippet.repeat(20);
        assert!(!looks_like_prose(&code));
    }

    /// Short-lined build logs must not pass the prose detector.
    #[test]
    fn shell_log_is_not_treated_as_prose() {
        let entry = "$ cargo build\n   Compiling foo v0.1.0\n    Finished dev\n";
        let log = entry.repeat(20);
        assert!(!looks_like_prose(&log));
    }
}
196}