Skip to main content

lean_ctx/core/
adaptive_thresholds.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use super::entropy::kolmogorov_proxy;
5
/// Set of numeric thresholds used to tune compression for a single file.
///
/// Baseline values come from the per-extension table `LANGUAGE_THRESHOLDS`
/// (or from [`Default`] for unknown extensions); `adaptive_thresholds` may
/// then shift `bpe_entropy` and `jaccard` based on feedback and content.
///
/// `PartialEq` is derived so that values can be compared directly in tests
/// and assertions. `Eq`/`Hash` are not available because the fields are `f64`.
#[derive(Debug, Clone, PartialEq)]
pub struct CompressionThresholds {
    /// Entropy threshold. When `adaptive_thresholds` adjusts for content it
    /// keeps this value within `0.4..=2.0`.
    // NOTE(review): the name suggests BPE-token entropy — confirm with the consumer.
    pub bpe_entropy: f64,
    /// Jaccard-similarity threshold. When `adaptive_thresholds` adjusts for
    /// content it keeps this value within `0.5..=0.85`.
    pub jaccard: f64,
    /// Threshold named `auto_delta`; in this file it is only ever set from the
    /// static table or the default.
    // NOTE(review): its exact meaning is defined by the consumer — not visible here.
    pub auto_delta: f64,
}

impl Default for CompressionThresholds {
    /// Fallback thresholds used for files whose extension is not in the
    /// per-language table.
    fn default() -> Self {
        Self {
            bpe_entropy: 1.0,
            jaccard: 0.7,
            auto_delta: 0.6,
        }
    }
}
22
23static LANGUAGE_THRESHOLDS: &[(&str, CompressionThresholds)] = &[
24    // Python: English-like syntax, significant whitespace → higher entropy baseline
25    (
26        "py",
27        CompressionThresholds {
28            bpe_entropy: 1.2,
29            jaccard: 0.65,
30            auto_delta: 0.55,
31        },
32    ),
33    // Rust: Repetitive keywords (fn, pub, impl, let, mut) → lower threshold catches more
34    // Tuning result: jaccard 0.55 gives 9% savings with 100% quality (was 0.72 = 6.4%)
35    (
36        "rs",
37        CompressionThresholds {
38            bpe_entropy: 0.85,
39            jaccard: 0.55,
40            auto_delta: 0.6,
41        },
42    ),
43    // TypeScript/JavaScript: Type annotations are predictable
44    (
45        "ts",
46        CompressionThresholds {
47            bpe_entropy: 0.95,
48            jaccard: 0.68,
49            auto_delta: 0.58,
50        },
51    ),
52    (
53        "tsx",
54        CompressionThresholds {
55            bpe_entropy: 0.95,
56            jaccard: 0.68,
57            auto_delta: 0.58,
58        },
59    ),
60    (
61        "js",
62        CompressionThresholds {
63            bpe_entropy: 1.0,
64            jaccard: 0.68,
65            auto_delta: 0.58,
66        },
67    ),
68    (
69        "jsx",
70        CompressionThresholds {
71            bpe_entropy: 1.0,
72            jaccard: 0.68,
73            auto_delta: 0.58,
74        },
75    ),
76    // Go: Verbose but highly structured → aggressive threshold
77    // Tuning: lowered jaccard for better dedup on verbose Go boilerplate
78    (
79        "go",
80        CompressionThresholds {
81            bpe_entropy: 0.9,
82            jaccard: 0.55,
83            auto_delta: 0.55,
84        },
85    ),
86    // Java/Kotlin: Very verbose, lots of boilerplate
87    // Tuning: aggressive jaccard for Java's extreme boilerplate repetition
88    (
89        "java",
90        CompressionThresholds {
91            bpe_entropy: 0.8,
92            jaccard: 0.48,
93            auto_delta: 0.5,
94        },
95    ),
96    (
97        "kt",
98        CompressionThresholds {
99            bpe_entropy: 0.85,
100            jaccard: 0.68,
101            auto_delta: 0.55,
102        },
103    ),
104    // C/C++: Headers are highly repetitive
105    (
106        "c",
107        CompressionThresholds {
108            bpe_entropy: 0.9,
109            jaccard: 0.7,
110            auto_delta: 0.6,
111        },
112    ),
113    (
114        "h",
115        CompressionThresholds {
116            bpe_entropy: 0.75,
117            jaccard: 0.65,
118            auto_delta: 0.5,
119        },
120    ),
121    (
122        "cpp",
123        CompressionThresholds {
124            bpe_entropy: 0.9,
125            jaccard: 0.7,
126            auto_delta: 0.6,
127        },
128    ),
129    (
130        "hpp",
131        CompressionThresholds {
132            bpe_entropy: 0.75,
133            jaccard: 0.65,
134            auto_delta: 0.5,
135        },
136    ),
137    // Ruby: English-like, high entropy
138    (
139        "rb",
140        CompressionThresholds {
141            bpe_entropy: 1.15,
142            jaccard: 0.65,
143            auto_delta: 0.55,
144        },
145    ),
146    // Config/data files: highly repetitive
147    (
148        "json",
149        CompressionThresholds {
150            bpe_entropy: 0.6,
151            jaccard: 0.6,
152            auto_delta: 0.4,
153        },
154    ),
155    (
156        "yaml",
157        CompressionThresholds {
158            bpe_entropy: 0.7,
159            jaccard: 0.62,
160            auto_delta: 0.45,
161        },
162    ),
163    (
164        "yml",
165        CompressionThresholds {
166            bpe_entropy: 0.7,
167            jaccard: 0.62,
168            auto_delta: 0.45,
169        },
170    ),
171    (
172        "toml",
173        CompressionThresholds {
174            bpe_entropy: 0.7,
175            jaccard: 0.62,
176            auto_delta: 0.45,
177        },
178    ),
179    (
180        "xml",
181        CompressionThresholds {
182            bpe_entropy: 0.6,
183            jaccard: 0.6,
184            auto_delta: 0.4,
185        },
186    ),
187    // Markdown/docs: natural language, high entropy
188    (
189        "md",
190        CompressionThresholds {
191            bpe_entropy: 1.3,
192            jaccard: 0.6,
193            auto_delta: 0.55,
194        },
195    ),
196    // CSS: very repetitive selectors/properties
197    (
198        "css",
199        CompressionThresholds {
200            bpe_entropy: 0.7,
201            jaccard: 0.6,
202            auto_delta: 0.45,
203        },
204    ),
205    (
206        "scss",
207        CompressionThresholds {
208            bpe_entropy: 0.75,
209            jaccard: 0.62,
210            auto_delta: 0.48,
211        },
212    ),
213    // SQL: repetitive keywords
214    (
215        "sql",
216        CompressionThresholds {
217            bpe_entropy: 0.8,
218            jaccard: 0.65,
219            auto_delta: 0.5,
220        },
221    ),
222    // Shell scripts
223    (
224        "sh",
225        CompressionThresholds {
226            bpe_entropy: 1.0,
227            jaccard: 0.68,
228            auto_delta: 0.55,
229        },
230    ),
231    (
232        "bash",
233        CompressionThresholds {
234            bpe_entropy: 1.0,
235            jaccard: 0.68,
236            auto_delta: 0.55,
237        },
238    ),
239    // Swift/C#
240    (
241        "swift",
242        CompressionThresholds {
243            bpe_entropy: 0.9,
244            jaccard: 0.68,
245            auto_delta: 0.55,
246        },
247    ),
248    (
249        "cs",
250        CompressionThresholds {
251            bpe_entropy: 0.85,
252            jaccard: 0.65,
253            auto_delta: 0.52,
254        },
255    ),
256    // PHP
257    (
258        "php",
259        CompressionThresholds {
260            bpe_entropy: 0.95,
261            jaccard: 0.68,
262            auto_delta: 0.55,
263        },
264    ),
265];
266
267fn language_map() -> HashMap<&'static str, &'static CompressionThresholds> {
268    LANGUAGE_THRESHOLDS
269        .iter()
270        .map(|(ext, t)| (*ext, t))
271        .collect()
272}
273
274pub fn thresholds_for_path(path: &str) -> CompressionThresholds {
275    let ext = Path::new(path)
276        .extension()
277        .and_then(|e| e.to_str())
278        .unwrap_or("");
279
280    let map = language_map();
281    if let Some(t) = map.get(ext) {
282        return (*t).clone();
283    }
284
285    CompressionThresholds::default()
286}
287
288pub fn adaptive_thresholds(path: &str, content: &str) -> CompressionThresholds {
289    let mut base = thresholds_for_path(path);
290
291    // Apply learned thresholds from feedback loop if available
292    let ext = std::path::Path::new(path)
293        .extension()
294        .and_then(|e| e.to_str())
295        .unwrap_or("");
296    let feedback = super::feedback::FeedbackStore::load();
297    if let Some(learned_entropy) = feedback.get_learned_entropy(ext) {
298        base.bpe_entropy = base.bpe_entropy * 0.6 + learned_entropy * 0.4;
299    }
300    if let Some(learned_jaccard) = feedback.get_learned_jaccard(ext) {
301        base.jaccard = base.jaccard * 0.6 + learned_jaccard * 0.4;
302    }
303
304    if content.len() > 500 {
305        let k = kolmogorov_proxy(content);
306        let k_adjustment = (k - 0.45) * 0.5;
307        base.bpe_entropy = (base.bpe_entropy + k_adjustment).clamp(0.4, 2.0);
308        base.jaccard = (base.jaccard - k_adjustment * 0.3).clamp(0.5, 0.85);
309    }
310
311    base
312}
313
#[cfg(test)]
mod tests {
    use super::*;

    /// The Rust baseline entropy threshold is below the Python one.
    #[test]
    fn rust_has_lower_threshold_than_python() {
        let rust_entropy = thresholds_for_path("src/main.rs").bpe_entropy;
        let python_entropy = thresholds_for_path("src/main.py").bpe_entropy;
        assert!(rust_entropy < python_entropy);
    }

    /// The JSON baseline entropy threshold is below the Rust one.
    #[test]
    fn json_has_lowest_threshold() {
        let json_entropy = thresholds_for_path("config.json").bpe_entropy;
        let rust_entropy = thresholds_for_path("main.rs").bpe_entropy;
        assert!(json_entropy < rust_entropy);
    }

    /// An extension missing from the table yields the default entropy of 1.0.
    #[test]
    fn unknown_ext_uses_default() {
        let fallback = thresholds_for_path("file.xyz");
        let distance = (fallback.bpe_entropy - 1.0).abs();
        assert!(distance < f64::EPSILON);
    }

    /// Highly repetitive content must end up with a lower entropy threshold
    /// than varied content of the same file type.
    #[test]
    fn adaptive_adjusts_for_compressibility() {
        let repetitive = "use std::io;\n".repeat(200);

        let mut diverse = String::new();
        for i in 0..200 {
            diverse.push_str(&format!("let var_{i} = compute_{i}(arg_{i});\n"));
        }

        let from_repetitive = adaptive_thresholds("main.rs", &repetitive);
        let from_diverse = adaptive_thresholds("main.rs", &diverse);
        assert!(
            from_repetitive.bpe_entropy < from_diverse.bpe_entropy,
            "repetitive content should get lower threshold: {} vs {}",
            from_repetitive.bpe_entropy,
            from_diverse.bpe_entropy
        );
    }
}