1use crate::error::Result;
11
/// A single pipeline stage under consideration, with its token cost and payoff.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StageCandidate {
    /// Identifier of the stage.
    pub name: String,
    /// Tokens the stage costs to apply.
    pub overhead_tokens: u32,
    /// Tokens the stage is expected to remove.
    pub savings_tokens: u32,
    /// Whether the stage applies to the profiled content at all.
    pub applicable: bool,
}

impl StageCandidate {
    /// Returns `savings_tokens - overhead_tokens` as a signed token count.
    ///
    /// The subtraction is carried out in `i64` and clamped to the `i32` range,
    /// so `u32` inputs above `i32::MAX` saturate instead of wrapping to a
    /// bogus (possibly negative) value.
    pub fn net_benefit(&self) -> i32 {
        let net = i64::from(self.savings_tokens) - i64::from(self.overhead_tokens);
        // The clamp guarantees the final narrowing cast is lossless.
        net.clamp(i64::from(i32::MIN), i64::from(i32::MAX)) as i32
    }
}
31
/// Outcome of stage selection: which stages to run and which to leave out.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct MdlSelection {
    /// Names of the stages chosen to run, in the order they were selected.
    pub enabled_stages: Vec<String>,
    /// Names of the stages that were considered but not chosen.
    pub skipped_stages: Vec<String>,
    /// Summed net token benefit of the stages that were chosen on merit.
    pub estimated_net_savings: i32,
}
42
/// Cheap structural facts about a piece of text, used to decide which stages apply.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ContentProfile {
    /// Content looks like JSON.
    pub is_json: bool,
    /// Length of the content in bytes.
    pub length: usize,
    /// Estimated token count of the content.
    pub tokens: u32,
    /// Runs of identical consecutive lines were detected.
    pub has_repetition: bool,
    /// Content looks like a unified diff.
    pub is_diff: bool,
    /// Content looks like natural-language prose.
    pub is_prose: bool,
    /// Content looks like log output.
    pub is_log: bool,
    /// Number of null values counted in JSON content.
    pub null_count: usize,
    /// Estimated number of array elements in JSON content.
    pub array_element_count: usize,
}
65
66pub fn select_stages(profile: &ContentProfile) -> MdlSelection {
72 let mut candidates = build_candidates(profile);
73
74 candidates.sort_by(|a, b| b.net_benefit().cmp(&a.net_benefit()));
76
77 let mut enabled = Vec::new();
78 let mut skipped = Vec::new();
79 let mut total_net = 0i32;
80
81 for candidate in &candidates {
82 if !candidate.applicable {
83 skipped.push(candidate.name.clone());
84 continue;
85 }
86
87 if candidate.net_benefit() > 0 {
88 enabled.push(candidate.name.clone());
89 total_net += candidate.net_benefit();
90 } else {
91 skipped.push(candidate.name.clone());
92 }
93 }
94
95 if !enabled.contains(&"ansi_strip".to_string()) {
97 enabled.insert(0, "ansi_strip".to_string());
98 }
99
100 MdlSelection {
101 enabled_stages: enabled,
102 skipped_stages: skipped,
103 estimated_net_savings: total_net,
104 }
105}
106
107fn build_candidates(p: &ContentProfile) -> Vec<StageCandidate> {
109 vec![
110 StageCandidate {
111 name: "strip_nulls".to_string(),
112 overhead_tokens: 0,
113 savings_tokens: if p.is_json { (p.null_count as u32) * 3 } else { 0 },
114 applicable: p.is_json && p.null_count > 0,
115 },
116 StageCandidate {
117 name: "condense".to_string(),
118 overhead_tokens: 0,
119 savings_tokens: if p.has_repetition { p.tokens / 4 } else { 0 },
120 applicable: !p.is_json && p.has_repetition,
121 },
122 StageCandidate {
123 name: "git_diff_fold".to_string(),
124 overhead_tokens: 2, savings_tokens: if p.is_diff { p.tokens / 3 } else { 0 },
126 applicable: p.is_diff,
127 },
128 StageCandidate {
129 name: "collapse_arrays".to_string(),
130 overhead_tokens: 5, savings_tokens: if p.is_json && p.array_element_count > 10 {
132 (p.array_element_count as u32 - 5) * 4
133 } else {
134 0
135 },
136 applicable: p.is_json && p.array_element_count > 10,
137 },
138 StageCandidate {
139 name: "flatten".to_string(),
140 overhead_tokens: 0,
141 savings_tokens: if p.is_json { p.tokens / 20 } else { 0 },
142 applicable: p.is_json && p.tokens > 50,
143 },
144 StageCandidate {
145 name: "truncate_strings".to_string(),
146 overhead_tokens: 1, savings_tokens: if p.is_json { p.tokens / 10 } else { 0 },
148 applicable: p.is_json && p.tokens > 100,
149 },
150 StageCandidate {
151 name: "rle".to_string(),
152 overhead_tokens: 3, savings_tokens: if p.has_repetition && !p.is_json { p.tokens / 5 } else { 0 },
154 applicable: !p.is_json && p.has_repetition && p.length > 200,
155 },
156 StageCandidate {
157 name: "sliding_window_dedup".to_string(),
158 overhead_tokens: 2, savings_tokens: if p.has_repetition && !p.is_json { p.tokens / 8 } else { 0 },
160 applicable: !p.is_json && p.has_repetition && p.length > 300,
161 },
162 StageCandidate {
163 name: "entropy_truncate".to_string(),
164 overhead_tokens: 3, savings_tokens: if p.is_prose && p.tokens > 100 { p.tokens / 6 } else { 0 },
166 applicable: !p.is_json && p.is_prose && p.length > 500,
167 },
168 StageCandidate {
169 name: "token_prune".to_string(),
170 overhead_tokens: 0,
171 savings_tokens: if p.is_prose { p.tokens / 10 } else { 0 },
172 applicable: !p.is_json && p.is_prose && p.length > 100,
173 },
174 StageCandidate {
175 name: "dict_compress".to_string(),
176 overhead_tokens: 15, savings_tokens: if p.is_json && p.tokens > 50 { p.tokens / 8 } else { 0 },
178 applicable: p.is_json && p.tokens > 120, },
180 StageCandidate {
181 name: "toon_encode".to_string(),
182 overhead_tokens: 2, savings_tokens: if p.is_json { p.tokens / 5 } else { 0 },
184 applicable: p.is_json,
185 },
186 StageCandidate {
187 name: "textrank".to_string(),
188 overhead_tokens: 0,
189 savings_tokens: if p.is_prose && p.tokens > 200 { p.tokens / 3 } else { 0 },
190 applicable: p.is_prose && p.tokens > 200,
191 },
192 ]
193}
194
195pub fn profile_content(text: &str) -> ContentProfile {
197 let is_json = text.trim().starts_with('{') || text.trim().starts_with('[');
198 let lines: Vec<&str> = text.lines().collect();
199 let tokens = (text.len() as u32 + 3) / 4;
200
201 let mut has_repetition = false;
203 if lines.len() > 2 {
204 let mut prev = "";
205 let mut run = 0;
206 for line in &lines {
207 if *line == prev {
208 run += 1;
209 if run >= 2 {
210 has_repetition = true;
211 break;
212 }
213 } else {
214 run = 0;
215 }
216 prev = line;
217 }
218 }
219
220 let is_diff = text.contains("\n+") && text.contains("\n-") && text.contains("@@");
221 let is_log = lines.iter().take(10).any(|l| {
222 l.contains("[INFO]") || l.contains("[ERROR]") || l.contains("[WARN]")
223 });
224
225 let mut prose_lines = 0;
227 let mut code_lines = 0;
228 for line in lines.iter().take(20) {
229 let t = line.trim();
230 if t.is_empty() { continue; }
231 if t.ends_with('{') || t.ends_with(';') || t.contains("::") || t.contains("->") {
232 code_lines += 1;
233 } else {
234 prose_lines += 1;
235 }
236 }
237 let is_prose = prose_lines > code_lines && !is_json && !is_diff && !is_log;
238
239 let null_count = if is_json { text.matches(":null").count() + text.matches(": null").count() } else { 0 };
241 let array_element_count = if is_json {
242 text.matches('{').count().saturating_sub(1) } else {
244 0
245 };
246
247 ContentProfile {
248 is_json,
249 length: text.len(),
250 tokens,
251 has_repetition,
252 is_diff,
253 is_prose,
254 is_log,
255 null_count,
256 array_element_count,
257 }
258}
259
#[cfg(test)]
mod tests {
    use super::*;

    /// Profile with every content flag cleared and both counters at zero.
    fn bare_profile(length: usize, tokens: u32) -> ContentProfile {
        ContentProfile {
            is_json: false,
            length,
            tokens,
            has_repetition: false,
            is_diff: false,
            is_prose: false,
            is_log: false,
            null_count: 0,
            array_element_count: 0,
        }
    }

    /// True when `stage` appears among the enabled stages of `selection`.
    fn enables(selection: &MdlSelection, stage: &str) -> bool {
        selection.enabled_stages.iter().any(|name| name == stage)
    }

    #[test]
    fn test_json_profile() {
        let profile = profile_content(r#"{"a":1,"b":null,"c":null}"#);
        assert!(profile.is_json);
        assert_eq!(profile.null_count, 2);
        assert!(!profile.is_prose);
    }

    #[test]
    fn test_prose_profile() {
        let profile = profile_content(
            "This is a normal sentence about something interesting and important.",
        );
        assert!(profile.is_prose);
        assert!(!profile.is_json);
    }

    #[test]
    fn test_diff_profile() {
        let profile = profile_content("@@ -1,5 +1,5 @@\n-old\n+new\n context\n");
        assert!(profile.is_diff);
    }

    #[test]
    fn test_log_profile() {
        let profile = profile_content("[INFO] Started\n[ERROR] Failed\n[WARN] Slow\n");
        assert!(profile.is_log);
    }

    #[test]
    fn test_select_stages_json() {
        let profile = ContentProfile {
            is_json: true,
            null_count: 5,
            ..bare_profile(500, 125)
        };
        let selection = select_stages(&profile);
        assert!(enables(&selection, "strip_nulls"));
        assert!(enables(&selection, "toon_encode"));
        assert!(!enables(&selection, "condense"));
    }

    #[test]
    fn test_select_stages_prose() {
        let profile = ContentProfile {
            is_prose: true,
            ..bare_profile(1000, 250)
        };
        let selection = select_stages(&profile);
        assert!(enables(&selection, "token_prune"));
        assert!(enables(&selection, "textrank"));
        assert!(!enables(&selection, "strip_nulls"));
    }

    #[test]
    fn test_select_stages_always_includes_ansi_strip() {
        let selection = select_stages(&bare_profile(10, 3));
        assert!(enables(&selection, "ansi_strip"));
    }

    #[test]
    fn test_select_stages_skips_overhead_for_small_json() {
        let profile = ContentProfile {
            is_json: true,
            ..bare_profile(50, 12)
        };
        let selection = select_stages(&profile);
        assert!(!enables(&selection, "dict_compress"));
    }

    #[test]
    fn test_net_benefit_calculation() {
        let candidate = |overhead_tokens: u32, savings_tokens: u32| StageCandidate {
            name: "test".to_string(),
            overhead_tokens,
            savings_tokens,
            applicable: true,
        };
        assert_eq!(candidate(5, 20).net_benefit(), 15);
        assert_eq!(candidate(20, 5).net_benefit(), -15);
    }

    #[test]
    fn test_repetitive_log_profile() {
        let profile = profile_content("[INFO] ok\n[INFO] ok\n[INFO] ok\n[ERROR] fail\n");
        assert!(profile.is_log);
        assert!(profile.has_repetition);
    }
}