Skip to main content

lean_ctx/core/
adaptive_thresholds.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use super::entropy::kolmogorov_proxy;
5
/// A set of three tuning thresholds used when deciding how to compress a file.
///
/// Values come either from the per-extension table in this module or from
/// [`Default`] when the extension is not listed; `adaptive_thresholds` then
/// adjusts `bpe_entropy` and `jaccard` further.
///
/// NOTE(review): the code that consumes these thresholds lives outside this
/// file, so the field descriptions below are inferred from the field names and
/// the table comments — confirm against the call sites.
#[derive(Debug, Clone, PartialEq)]
pub struct CompressionThresholds {
    /// Entropy threshold. The table comments in this module describe a lower
    /// value as one that "catches more".
    pub bpe_entropy: f64,
    /// Jaccard threshold (presumably a similarity cut-off — TODO confirm).
    pub jaccard: f64,
    /// Threshold for the automatic delta decision (exact meaning not visible
    /// from this file — TODO confirm).
    pub auto_delta: f64,
}
12
13impl Default for CompressionThresholds {
14    fn default() -> Self {
15        Self {
16            bpe_entropy: 1.0,
17            jaccard: 0.7,
18            auto_delta: 0.6,
19        }
20    }
21}
22
23static LANGUAGE_THRESHOLDS: &[(&str, CompressionThresholds)] = &[
24    // Python: English-like syntax, significant whitespace → higher entropy baseline
25    (
26        "py",
27        CompressionThresholds {
28            bpe_entropy: 1.2,
29            jaccard: 0.65,
30            auto_delta: 0.55,
31        },
32    ),
33    // Rust: Repetitive keywords (fn, pub, impl, let, mut) → lower threshold catches more
34    (
35        "rs",
36        CompressionThresholds {
37            bpe_entropy: 0.85,
38            jaccard: 0.72,
39            auto_delta: 0.6,
40        },
41    ),
42    // TypeScript/JavaScript: Type annotations are predictable
43    (
44        "ts",
45        CompressionThresholds {
46            bpe_entropy: 0.95,
47            jaccard: 0.68,
48            auto_delta: 0.58,
49        },
50    ),
51    (
52        "tsx",
53        CompressionThresholds {
54            bpe_entropy: 0.95,
55            jaccard: 0.68,
56            auto_delta: 0.58,
57        },
58    ),
59    (
60        "js",
61        CompressionThresholds {
62            bpe_entropy: 1.0,
63            jaccard: 0.68,
64            auto_delta: 0.58,
65        },
66    ),
67    (
68        "jsx",
69        CompressionThresholds {
70            bpe_entropy: 1.0,
71            jaccard: 0.68,
72            auto_delta: 0.58,
73        },
74    ),
75    // Go: Verbose but highly structured → aggressive threshold
76    (
77        "go",
78        CompressionThresholds {
79            bpe_entropy: 0.9,
80            jaccard: 0.72,
81            auto_delta: 0.55,
82        },
83    ),
84    // Java/Kotlin: Very verbose, lots of boilerplate
85    (
86        "java",
87        CompressionThresholds {
88            bpe_entropy: 0.8,
89            jaccard: 0.65,
90            auto_delta: 0.5,
91        },
92    ),
93    (
94        "kt",
95        CompressionThresholds {
96            bpe_entropy: 0.85,
97            jaccard: 0.68,
98            auto_delta: 0.55,
99        },
100    ),
101    // C/C++: Headers are highly repetitive
102    (
103        "c",
104        CompressionThresholds {
105            bpe_entropy: 0.9,
106            jaccard: 0.7,
107            auto_delta: 0.6,
108        },
109    ),
110    (
111        "h",
112        CompressionThresholds {
113            bpe_entropy: 0.75,
114            jaccard: 0.65,
115            auto_delta: 0.5,
116        },
117    ),
118    (
119        "cpp",
120        CompressionThresholds {
121            bpe_entropy: 0.9,
122            jaccard: 0.7,
123            auto_delta: 0.6,
124        },
125    ),
126    (
127        "hpp",
128        CompressionThresholds {
129            bpe_entropy: 0.75,
130            jaccard: 0.65,
131            auto_delta: 0.5,
132        },
133    ),
134    // Ruby: English-like, high entropy
135    (
136        "rb",
137        CompressionThresholds {
138            bpe_entropy: 1.15,
139            jaccard: 0.65,
140            auto_delta: 0.55,
141        },
142    ),
143    // Config/data files: highly repetitive
144    (
145        "json",
146        CompressionThresholds {
147            bpe_entropy: 0.6,
148            jaccard: 0.6,
149            auto_delta: 0.4,
150        },
151    ),
152    (
153        "yaml",
154        CompressionThresholds {
155            bpe_entropy: 0.7,
156            jaccard: 0.62,
157            auto_delta: 0.45,
158        },
159    ),
160    (
161        "yml",
162        CompressionThresholds {
163            bpe_entropy: 0.7,
164            jaccard: 0.62,
165            auto_delta: 0.45,
166        },
167    ),
168    (
169        "toml",
170        CompressionThresholds {
171            bpe_entropy: 0.7,
172            jaccard: 0.62,
173            auto_delta: 0.45,
174        },
175    ),
176    (
177        "xml",
178        CompressionThresholds {
179            bpe_entropy: 0.6,
180            jaccard: 0.6,
181            auto_delta: 0.4,
182        },
183    ),
184    // Markdown/docs: natural language, high entropy
185    (
186        "md",
187        CompressionThresholds {
188            bpe_entropy: 1.3,
189            jaccard: 0.6,
190            auto_delta: 0.55,
191        },
192    ),
193    // CSS: very repetitive selectors/properties
194    (
195        "css",
196        CompressionThresholds {
197            bpe_entropy: 0.7,
198            jaccard: 0.6,
199            auto_delta: 0.45,
200        },
201    ),
202    (
203        "scss",
204        CompressionThresholds {
205            bpe_entropy: 0.75,
206            jaccard: 0.62,
207            auto_delta: 0.48,
208        },
209    ),
210    // SQL: repetitive keywords
211    (
212        "sql",
213        CompressionThresholds {
214            bpe_entropy: 0.8,
215            jaccard: 0.65,
216            auto_delta: 0.5,
217        },
218    ),
219    // Shell scripts
220    (
221        "sh",
222        CompressionThresholds {
223            bpe_entropy: 1.0,
224            jaccard: 0.68,
225            auto_delta: 0.55,
226        },
227    ),
228    (
229        "bash",
230        CompressionThresholds {
231            bpe_entropy: 1.0,
232            jaccard: 0.68,
233            auto_delta: 0.55,
234        },
235    ),
236    // Swift/C#
237    (
238        "swift",
239        CompressionThresholds {
240            bpe_entropy: 0.9,
241            jaccard: 0.68,
242            auto_delta: 0.55,
243        },
244    ),
245    (
246        "cs",
247        CompressionThresholds {
248            bpe_entropy: 0.85,
249            jaccard: 0.65,
250            auto_delta: 0.52,
251        },
252    ),
253    // PHP
254    (
255        "php",
256        CompressionThresholds {
257            bpe_entropy: 0.95,
258            jaccard: 0.68,
259            auto_delta: 0.55,
260        },
261    ),
262];
263
264fn language_map() -> HashMap<&'static str, &'static CompressionThresholds> {
265    LANGUAGE_THRESHOLDS
266        .iter()
267        .map(|(ext, t)| (*ext, t))
268        .collect()
269}
270
271pub fn thresholds_for_path(path: &str) -> CompressionThresholds {
272    let ext = Path::new(path)
273        .extension()
274        .and_then(|e| e.to_str())
275        .unwrap_or("");
276
277    let map = language_map();
278    if let Some(t) = map.get(ext) {
279        return (*t).clone();
280    }
281
282    CompressionThresholds::default()
283}
284
285pub fn adaptive_thresholds(path: &str, content: &str) -> CompressionThresholds {
286    let mut base = thresholds_for_path(path);
287
288    let ext = std::path::Path::new(path)
289        .extension()
290        .and_then(|e| e.to_str())
291        .unwrap_or("");
292    let feedback = super::feedback::FeedbackStore::load();
293    if let Some(learned_entropy) = feedback.get_learned_entropy(ext) {
294        base.bpe_entropy = base.bpe_entropy * 0.6 + learned_entropy * 0.4;
295    }
296    if let Some(learned_jaccard) = feedback.get_learned_jaccard(ext) {
297        base.jaccard = base.jaccard * 0.6 + learned_jaccard * 0.4;
298    }
299
300    if content.len() > 500 {
301        let k = kolmogorov_proxy(content);
302        let k_adjustment = (k - 0.45) * 0.5;
303        base.bpe_entropy = (base.bpe_entropy + k_adjustment).clamp(0.4, 2.0);
304        base.jaccard = (base.jaccard - k_adjustment * 0.3).clamp(0.5, 0.85);
305    }
306
307    if let Some(project_root) =
308        crate::core::session::SessionState::load_latest().and_then(|s| s.project_root)
309    {
310        let bandit_key = format!("{ext}_{}", token_bucket_label(content));
311        let mut store = super::bandit::BanditStore::load(&project_root);
312        let bandit = store.get_or_create(&bandit_key);
313        let arm = bandit.select_arm();
314        base.bpe_entropy = base.bpe_entropy * 0.5 + arm.entropy_threshold * 0.5;
315        base.jaccard = base.jaccard * 0.5 + arm.jaccard_threshold * 0.5;
316        LAST_BANDIT_ARM
317            .lock()
318            .unwrap_or_else(|e| e.into_inner())
319            .replace((project_root, bandit_key, arm.name.clone()));
320    }
321
322    base
323}
324
325pub fn report_bandit_outcome(success: bool) {
326    let data = LAST_BANDIT_ARM
327        .lock()
328        .unwrap_or_else(|e| e.into_inner())
329        .take();
330    if let Some((project_root, bandit_key, arm_name)) = data {
331        let mut store = super::bandit::BanditStore::load(&project_root);
332        store.get_or_create(&bandit_key).update(&arm_name, success);
333        let _ = store.save(&project_root);
334    }
335}
336
/// `(project_root, bandit_key, arm_name)` describing one bandit selection.
type BanditSelection = (String, String, String);

/// The latest bandit selection that has not yet been reported.
///
/// Written by `adaptive_thresholds` and taken by `report_bandit_outcome`.
/// It holds a single slot, so a newer selection replaces an unreported one.
static LAST_BANDIT_ARM: std::sync::Mutex<Option<BanditSelection>> = std::sync::Mutex::new(None);
339
/// Classifies `content` into a size bucket by its length in bytes.
///
/// | bytes            | label  |
/// |------------------|--------|
/// | `0..=2000`       | `"sm"` |
/// | `2001..=10000`   | `"md"` |
/// | `10001..=50000`  | `"lg"` |
/// | above `50000`    | `"xl"` |
fn token_bucket_label(content: &str) -> &'static str {
    let bytes = content.len();
    if bytes <= 2000 {
        "sm"
    } else if bytes <= 10000 {
        "md"
    } else if bytes <= 50000 {
        "lg"
    } else {
        "xl"
    }
}
349
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rust_has_lower_threshold_than_python() {
        let rs = thresholds_for_path("src/main.rs");
        let py = thresholds_for_path("src/main.py");
        assert!(rs.bpe_entropy < py.bpe_entropy);
    }

    /// JSON must have the minimum `bpe_entropy` of the whole table (ties
    /// allowed), not merely a lower value than one other language.
    #[test]
    fn json_has_lowest_threshold() {
        let json = thresholds_for_path("config.json");
        for (ext, t) in LANGUAGE_THRESHOLDS {
            assert!(
                json.bpe_entropy <= t.bpe_entropy,
                "`{ext}` has a lower bpe_entropy ({}) than json ({})",
                t.bpe_entropy,
                json.bpe_entropy
            );
        }
    }

    /// Unknown extensions, extension-less names and the empty path all fall
    /// back to the default thresholds.
    #[test]
    fn unknown_ext_uses_default() {
        for path in ["file.xyz", "Makefile", ""] {
            let t = thresholds_for_path(path);
            assert!(
                (t.bpe_entropy - 1.0).abs() < f64::EPSILON,
                "bpe_entropy for {path:?}"
            );
            assert!((t.jaccard - 0.7).abs() < f64::EPSILON, "jaccard for {path:?}");
            assert!(
                (t.auto_delta - 0.6).abs() < f64::EPSILON,
                "auto_delta for {path:?}"
            );
        }
    }

    /// Every extension listed in the table resolves to exactly its own row.
    #[test]
    fn every_table_extension_resolves_to_its_row() {
        for (ext, expected) in LANGUAGE_THRESHOLDS {
            let got = thresholds_for_path(&format!("dir/file.{ext}"));
            assert!(
                (got.bpe_entropy - expected.bpe_entropy).abs() < f64::EPSILON,
                "bpe_entropy mismatch for `{ext}`"
            );
            assert!(
                (got.jaccard - expected.jaccard).abs() < f64::EPSILON,
                "jaccard mismatch for `{ext}`"
            );
            assert!(
                (got.auto_delta - expected.auto_delta).abs() < f64::EPSILON,
                "auto_delta mismatch for `{ext}`"
            );
        }
    }

    #[test]
    fn token_bucket_label_boundaries() {
        let cases = [
            (0, "sm"),
            (2000, "sm"),
            (2001, "md"),
            (10000, "md"),
            (10001, "lg"),
            (50000, "lg"),
            (50001, "xl"),
        ];
        for (len, label) in cases {
            assert_eq!(token_bucket_label(&"x".repeat(len)), label, "len = {len}");
        }
    }

    /// Checks the signal that drives the content-based adjustment.
    ///
    /// `adaptive_thresholds` is not called directly because it loads the
    /// feedback, session and bandit stores, which a unit test should not
    /// depend on.
    #[test]
    fn repetitive_content_has_lower_kolmogorov_proxy() {
        let repetitive = "use std::io;\n".repeat(200);
        let diverse: String = (0..200)
            .map(|i| format!("let var_{i} = compute_{i}(arg_{i});\n"))
            .collect();

        // Both samples must exceed the 500-byte gate used by `adaptive_thresholds`.
        assert!(repetitive.len() > 500);
        assert!(diverse.len() > 500);

        let k_rep = kolmogorov_proxy(&repetitive);
        let k_div = kolmogorov_proxy(&diverse);
        assert!(
            k_rep < k_div,
            "repetitive content should have lower Kolmogorov proxy: {k_rep} vs {k_div}"
        );
    }
}