1use std::collections::HashMap;
2use std::path::Path;
3
4use super::entropy::kolmogorov_proxy;
5
/// Threshold values that steer compression decisions for a single file.
///
/// Static per-extension presets live in `LANGUAGE_THRESHOLDS`; `adaptive_thresholds`
/// starts from one of those presets and adjusts `bpe_entropy` and `jaccard` at runtime.
#[derive(Debug, Clone, PartialEq)]
pub struct CompressionThresholds {
    /// Entropy threshold. `adaptive_thresholds` blends it with learned feedback, shifts it
    /// by the content's Kolmogorov proxy, and averages it with a bandit arm's value.
    pub bpe_entropy: f64,
    /// Jaccard threshold (presumably a set-similarity cut-off — confirm against the
    /// consumers of this struct). Adjusted alongside `bpe_entropy`.
    pub jaccard: f64,
    /// "Auto delta" threshold. Nothing in this module adjusts it; only the static
    /// per-extension value (or the default) is ever returned.
    pub auto_delta: f64,
}
12
13impl Default for CompressionThresholds {
14 fn default() -> Self {
15 Self {
16 bpe_entropy: 1.0,
17 jaccard: 0.7,
18 auto_delta: 0.6,
19 }
20 }
21}
22
23static LANGUAGE_THRESHOLDS: &[(&str, CompressionThresholds)] = &[
24 (
26 "py",
27 CompressionThresholds {
28 bpe_entropy: 1.2,
29 jaccard: 0.65,
30 auto_delta: 0.55,
31 },
32 ),
33 (
35 "rs",
36 CompressionThresholds {
37 bpe_entropy: 0.85,
38 jaccard: 0.72,
39 auto_delta: 0.6,
40 },
41 ),
42 (
44 "ts",
45 CompressionThresholds {
46 bpe_entropy: 0.95,
47 jaccard: 0.68,
48 auto_delta: 0.58,
49 },
50 ),
51 (
52 "tsx",
53 CompressionThresholds {
54 bpe_entropy: 0.95,
55 jaccard: 0.68,
56 auto_delta: 0.58,
57 },
58 ),
59 (
60 "js",
61 CompressionThresholds {
62 bpe_entropy: 1.0,
63 jaccard: 0.68,
64 auto_delta: 0.58,
65 },
66 ),
67 (
68 "jsx",
69 CompressionThresholds {
70 bpe_entropy: 1.0,
71 jaccard: 0.68,
72 auto_delta: 0.58,
73 },
74 ),
75 (
77 "go",
78 CompressionThresholds {
79 bpe_entropy: 0.9,
80 jaccard: 0.72,
81 auto_delta: 0.55,
82 },
83 ),
84 (
86 "java",
87 CompressionThresholds {
88 bpe_entropy: 0.8,
89 jaccard: 0.65,
90 auto_delta: 0.5,
91 },
92 ),
93 (
94 "kt",
95 CompressionThresholds {
96 bpe_entropy: 0.85,
97 jaccard: 0.68,
98 auto_delta: 0.55,
99 },
100 ),
101 (
103 "c",
104 CompressionThresholds {
105 bpe_entropy: 0.9,
106 jaccard: 0.7,
107 auto_delta: 0.6,
108 },
109 ),
110 (
111 "h",
112 CompressionThresholds {
113 bpe_entropy: 0.75,
114 jaccard: 0.65,
115 auto_delta: 0.5,
116 },
117 ),
118 (
119 "cpp",
120 CompressionThresholds {
121 bpe_entropy: 0.9,
122 jaccard: 0.7,
123 auto_delta: 0.6,
124 },
125 ),
126 (
127 "hpp",
128 CompressionThresholds {
129 bpe_entropy: 0.75,
130 jaccard: 0.65,
131 auto_delta: 0.5,
132 },
133 ),
134 (
136 "rb",
137 CompressionThresholds {
138 bpe_entropy: 1.15,
139 jaccard: 0.65,
140 auto_delta: 0.55,
141 },
142 ),
143 (
145 "json",
146 CompressionThresholds {
147 bpe_entropy: 0.6,
148 jaccard: 0.6,
149 auto_delta: 0.4,
150 },
151 ),
152 (
153 "yaml",
154 CompressionThresholds {
155 bpe_entropy: 0.7,
156 jaccard: 0.62,
157 auto_delta: 0.45,
158 },
159 ),
160 (
161 "yml",
162 CompressionThresholds {
163 bpe_entropy: 0.7,
164 jaccard: 0.62,
165 auto_delta: 0.45,
166 },
167 ),
168 (
169 "toml",
170 CompressionThresholds {
171 bpe_entropy: 0.7,
172 jaccard: 0.62,
173 auto_delta: 0.45,
174 },
175 ),
176 (
177 "xml",
178 CompressionThresholds {
179 bpe_entropy: 0.6,
180 jaccard: 0.6,
181 auto_delta: 0.4,
182 },
183 ),
184 (
186 "md",
187 CompressionThresholds {
188 bpe_entropy: 1.3,
189 jaccard: 0.6,
190 auto_delta: 0.55,
191 },
192 ),
193 (
195 "css",
196 CompressionThresholds {
197 bpe_entropy: 0.7,
198 jaccard: 0.6,
199 auto_delta: 0.45,
200 },
201 ),
202 (
203 "scss",
204 CompressionThresholds {
205 bpe_entropy: 0.75,
206 jaccard: 0.62,
207 auto_delta: 0.48,
208 },
209 ),
210 (
212 "sql",
213 CompressionThresholds {
214 bpe_entropy: 0.8,
215 jaccard: 0.65,
216 auto_delta: 0.5,
217 },
218 ),
219 (
221 "sh",
222 CompressionThresholds {
223 bpe_entropy: 1.0,
224 jaccard: 0.68,
225 auto_delta: 0.55,
226 },
227 ),
228 (
229 "bash",
230 CompressionThresholds {
231 bpe_entropy: 1.0,
232 jaccard: 0.68,
233 auto_delta: 0.55,
234 },
235 ),
236 (
238 "swift",
239 CompressionThresholds {
240 bpe_entropy: 0.9,
241 jaccard: 0.68,
242 auto_delta: 0.55,
243 },
244 ),
245 (
246 "cs",
247 CompressionThresholds {
248 bpe_entropy: 0.85,
249 jaccard: 0.65,
250 auto_delta: 0.52,
251 },
252 ),
253 (
255 "php",
256 CompressionThresholds {
257 bpe_entropy: 0.95,
258 jaccard: 0.68,
259 auto_delta: 0.55,
260 },
261 ),
262];
263
264fn language_map() -> HashMap<&'static str, &'static CompressionThresholds> {
265 LANGUAGE_THRESHOLDS
266 .iter()
267 .map(|(ext, t)| (*ext, t))
268 .collect()
269}
270
271pub fn thresholds_for_path(path: &str) -> CompressionThresholds {
272 let ext = Path::new(path)
273 .extension()
274 .and_then(|e| e.to_str())
275 .unwrap_or("");
276
277 let map = language_map();
278 if let Some(t) = map.get(ext) {
279 return (*t).clone();
280 }
281
282 CompressionThresholds::default()
283}
284
285pub fn adaptive_thresholds(path: &str, content: &str) -> CompressionThresholds {
286 let mut base = thresholds_for_path(path);
287
288 let ext = std::path::Path::new(path)
289 .extension()
290 .and_then(|e| e.to_str())
291 .unwrap_or("");
292 let feedback = super::feedback::FeedbackStore::load();
293 if let Some(learned_entropy) = feedback.get_learned_entropy(ext) {
294 base.bpe_entropy = base.bpe_entropy * 0.6 + learned_entropy * 0.4;
295 }
296 if let Some(learned_jaccard) = feedback.get_learned_jaccard(ext) {
297 base.jaccard = base.jaccard * 0.6 + learned_jaccard * 0.4;
298 }
299
300 if content.len() > 500 {
301 let k = kolmogorov_proxy(content);
302 let k_adjustment = (k - 0.45) * 0.5;
303 base.bpe_entropy = (base.bpe_entropy + k_adjustment).clamp(0.4, 2.0);
304 base.jaccard = (base.jaccard - k_adjustment * 0.3).clamp(0.5, 0.85);
305 }
306
307 if let Some(project_root) =
308 crate::core::session::SessionState::load_latest().and_then(|s| s.project_root)
309 {
310 let bandit_key = format!("{ext}_{}", token_bucket_label(content));
311 let mut store = super::bandit::BanditStore::load(&project_root);
312 let bandit = store.get_or_create(&bandit_key);
313 let arm = bandit.select_arm();
314 base.bpe_entropy = base.bpe_entropy * 0.5 + arm.entropy_threshold * 0.5;
315 base.jaccard = base.jaccard * 0.5 + arm.jaccard_threshold * 0.5;
316 LAST_BANDIT_ARM
317 .lock()
318 .unwrap_or_else(|e| e.into_inner())
319 .replace((project_root, bandit_key, arm.name.clone()));
320 }
321
322 base
323}
324
325pub fn report_bandit_outcome(success: bool) {
326 let data = LAST_BANDIT_ARM
327 .lock()
328 .unwrap_or_else(|e| e.into_inner())
329 .take();
330 if let Some((project_root, bandit_key, arm_name)) = data {
331 let mut store = super::bandit::BanditStore::load(&project_root);
332 store.get_or_create(&bandit_key).update(&arm_name, success);
333 let _ = store.save(&project_root);
334 }
335}
336
/// A recorded bandit selection: `(project_root, bandit_key, arm_name)`.
type BanditSelection = (String, String, String);

/// The bandit arm most recently selected by `adaptive_thresholds`, waiting to be consumed
/// by `report_bandit_outcome`. Holds at most one selection; a newer one overwrites it.
static LAST_BANDIT_ARM: std::sync::Mutex<Option<BanditSelection>> = std::sync::Mutex::new(None);
339
/// Classifies `content` into a coarse size bucket by its length in bytes.
///
/// Returns `"sm"` up to 2000 bytes, `"md"` up to 10000, `"lg"` up to 50000 and `"xl"`
/// beyond that (all upper bounds inclusive).
fn token_bucket_label(content: &str) -> &'static str {
    let bytes = content.len();
    if bytes <= 2000 {
        "sm"
    } else if bytes <= 10000 {
        "md"
    } else if bytes <= 50000 {
        "lg"
    } else {
        "xl"
    }
}
349
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rust_has_lower_threshold_than_python() {
        let rs = thresholds_for_path("src/main.rs");
        let py = thresholds_for_path("src/main.py");
        assert!(rs.bpe_entropy < py.bpe_entropy);
    }

    #[test]
    fn json_has_lowest_threshold() {
        let json = thresholds_for_path("config.json");
        let rs = thresholds_for_path("main.rs");
        assert!(json.bpe_entropy < rs.bpe_entropy);

        // "Lowest" means no preset in the table sits below JSON (ties are allowed).
        for (ext, preset) in LANGUAGE_THRESHOLDS.iter() {
            assert!(
                json.bpe_entropy <= preset.bpe_entropy,
                "json bpe_entropy {} should not exceed {} for .{}",
                json.bpe_entropy,
                preset.bpe_entropy,
                ext
            );
        }
    }

    #[test]
    fn unknown_ext_uses_default() {
        let t = thresholds_for_path("file.xyz");
        assert!((t.bpe_entropy - 1.0).abs() < f64::EPSILON);
    }

    // `adaptive_thresholds` is not called here because it loads feedback, session and
    // bandit state. This test only checks the ordering of the Kolmogorov proxy, which is
    // the signal that drives the content-based adjustment.
    #[test]
    fn adaptive_adjusts_for_compressibility() {
        let repetitive = "use std::io;\n".repeat(200);
        let diverse: String = (0..200)
            .map(|i| format!("let var_{i} = compute_{i}(arg_{i});\n"))
            .collect();

        let base_rep = thresholds_for_path("main.rs");
        let base_div = thresholds_for_path("main.rs");
        assert!(
            (base_rep.bpe_entropy - base_div.bpe_entropy).abs() < f64::EPSILON,
            "same path should get same base thresholds"
        );

        let k_rep = kolmogorov_proxy(&repetitive);
        let k_div = kolmogorov_proxy(&diverse);
        assert!(
            k_rep < k_div,
            "repetitive content should have lower Kolmogorov proxy: {} vs {}",
            k_rep,
            k_div
        );
    }
}