Skip to main content

lean_ctx/core/
adaptive_thresholds.rs

1use std::collections::HashMap;
2use std::path::Path;
3
4use super::entropy::kolmogorov_proxy;
5
/// Tunable cut-off values that steer compression decisions for one file.
///
/// Values compare field-by-field via `PartialEq`, so whole threshold sets can
/// be checked with `assert_eq!` or `==`.
#[derive(Debug, Clone, PartialEq)]
pub struct CompressionThresholds {
    /// Entropy cut-off. NOTE(review): presumably compared against a BPE-token
    /// entropy measure computed elsewhere — confirm against the caller.
    pub bpe_entropy: f64,
    /// Similarity cut-off. NOTE(review): presumably a Jaccard-similarity
    /// bound — confirm against the caller.
    pub jaccard: f64,
    /// Delta cut-off. NOTE(review): exact meaning is not visible in this
    /// file — confirm against the caller.
    pub auto_delta: f64,
}
12
13impl Default for CompressionThresholds {
14    fn default() -> Self {
15        Self {
16            bpe_entropy: 1.0,
17            jaccard: 0.7,
18            auto_delta: 0.6,
19        }
20    }
21}
22
23static LANGUAGE_THRESHOLDS: &[(&str, CompressionThresholds)] = &[
24    // Python: English-like syntax, significant whitespace → higher entropy baseline
25    (
26        "py",
27        CompressionThresholds {
28            bpe_entropy: 1.2,
29            jaccard: 0.65,
30            auto_delta: 0.55,
31        },
32    ),
33    // Rust: Repetitive keywords (fn, pub, impl, let, mut) → lower threshold catches more
34    (
35        "rs",
36        CompressionThresholds {
37            bpe_entropy: 0.85,
38            jaccard: 0.72,
39            auto_delta: 0.6,
40        },
41    ),
42    // TypeScript/JavaScript: Type annotations are predictable
43    (
44        "ts",
45        CompressionThresholds {
46            bpe_entropy: 0.95,
47            jaccard: 0.68,
48            auto_delta: 0.58,
49        },
50    ),
51    (
52        "tsx",
53        CompressionThresholds {
54            bpe_entropy: 0.95,
55            jaccard: 0.68,
56            auto_delta: 0.58,
57        },
58    ),
59    (
60        "js",
61        CompressionThresholds {
62            bpe_entropy: 1.0,
63            jaccard: 0.68,
64            auto_delta: 0.58,
65        },
66    ),
67    (
68        "jsx",
69        CompressionThresholds {
70            bpe_entropy: 1.0,
71            jaccard: 0.68,
72            auto_delta: 0.58,
73        },
74    ),
75    // Go: Verbose but highly structured → aggressive threshold
76    (
77        "go",
78        CompressionThresholds {
79            bpe_entropy: 0.9,
80            jaccard: 0.72,
81            auto_delta: 0.55,
82        },
83    ),
84    // Java/Kotlin: Very verbose, lots of boilerplate
85    (
86        "java",
87        CompressionThresholds {
88            bpe_entropy: 0.8,
89            jaccard: 0.65,
90            auto_delta: 0.5,
91        },
92    ),
93    (
94        "kt",
95        CompressionThresholds {
96            bpe_entropy: 0.85,
97            jaccard: 0.68,
98            auto_delta: 0.55,
99        },
100    ),
101    // C/C++: Headers are highly repetitive
102    (
103        "c",
104        CompressionThresholds {
105            bpe_entropy: 0.9,
106            jaccard: 0.7,
107            auto_delta: 0.6,
108        },
109    ),
110    (
111        "h",
112        CompressionThresholds {
113            bpe_entropy: 0.75,
114            jaccard: 0.65,
115            auto_delta: 0.5,
116        },
117    ),
118    (
119        "cpp",
120        CompressionThresholds {
121            bpe_entropy: 0.9,
122            jaccard: 0.7,
123            auto_delta: 0.6,
124        },
125    ),
126    (
127        "hpp",
128        CompressionThresholds {
129            bpe_entropy: 0.75,
130            jaccard: 0.65,
131            auto_delta: 0.5,
132        },
133    ),
134    // Ruby: English-like, high entropy
135    (
136        "rb",
137        CompressionThresholds {
138            bpe_entropy: 1.15,
139            jaccard: 0.65,
140            auto_delta: 0.55,
141        },
142    ),
143    // Config/data files: highly repetitive
144    (
145        "json",
146        CompressionThresholds {
147            bpe_entropy: 0.6,
148            jaccard: 0.6,
149            auto_delta: 0.4,
150        },
151    ),
152    (
153        "yaml",
154        CompressionThresholds {
155            bpe_entropy: 0.7,
156            jaccard: 0.62,
157            auto_delta: 0.45,
158        },
159    ),
160    (
161        "yml",
162        CompressionThresholds {
163            bpe_entropy: 0.7,
164            jaccard: 0.62,
165            auto_delta: 0.45,
166        },
167    ),
168    (
169        "toml",
170        CompressionThresholds {
171            bpe_entropy: 0.7,
172            jaccard: 0.62,
173            auto_delta: 0.45,
174        },
175    ),
176    (
177        "xml",
178        CompressionThresholds {
179            bpe_entropy: 0.6,
180            jaccard: 0.6,
181            auto_delta: 0.4,
182        },
183    ),
184    // Markdown/docs: natural language, high entropy
185    (
186        "md",
187        CompressionThresholds {
188            bpe_entropy: 1.3,
189            jaccard: 0.6,
190            auto_delta: 0.55,
191        },
192    ),
193    // CSS: very repetitive selectors/properties
194    (
195        "css",
196        CompressionThresholds {
197            bpe_entropy: 0.7,
198            jaccard: 0.6,
199            auto_delta: 0.45,
200        },
201    ),
202    (
203        "scss",
204        CompressionThresholds {
205            bpe_entropy: 0.75,
206            jaccard: 0.62,
207            auto_delta: 0.48,
208        },
209    ),
210    // SQL: repetitive keywords
211    (
212        "sql",
213        CompressionThresholds {
214            bpe_entropy: 0.8,
215            jaccard: 0.65,
216            auto_delta: 0.5,
217        },
218    ),
219    // Shell scripts
220    (
221        "sh",
222        CompressionThresholds {
223            bpe_entropy: 1.0,
224            jaccard: 0.68,
225            auto_delta: 0.55,
226        },
227    ),
228    (
229        "bash",
230        CompressionThresholds {
231            bpe_entropy: 1.0,
232            jaccard: 0.68,
233            auto_delta: 0.55,
234        },
235    ),
236    // Swift/C#
237    (
238        "swift",
239        CompressionThresholds {
240            bpe_entropy: 0.9,
241            jaccard: 0.68,
242            auto_delta: 0.55,
243        },
244    ),
245    (
246        "cs",
247        CompressionThresholds {
248            bpe_entropy: 0.85,
249            jaccard: 0.65,
250            auto_delta: 0.52,
251        },
252    ),
253    // PHP
254    (
255        "php",
256        CompressionThresholds {
257            bpe_entropy: 0.95,
258            jaccard: 0.68,
259            auto_delta: 0.55,
260        },
261    ),
262];
263
264fn language_map() -> HashMap<&'static str, &'static CompressionThresholds> {
265    LANGUAGE_THRESHOLDS
266        .iter()
267        .map(|(ext, t)| (*ext, t))
268        .collect()
269}
270
271pub fn thresholds_for_path(path: &str) -> CompressionThresholds {
272    let ext = Path::new(path)
273        .extension()
274        .and_then(|e| e.to_str())
275        .unwrap_or("");
276
277    let map = language_map();
278    if let Some(t) = map.get(ext) {
279        return (*t).clone();
280    }
281
282    CompressionThresholds::default()
283}
284
285pub fn adaptive_thresholds(path: &str, content: &str) -> CompressionThresholds {
286    let mut base = thresholds_for_path(path);
287
288    // Apply learned thresholds from feedback loop if available
289    let ext = std::path::Path::new(path)
290        .extension()
291        .and_then(|e| e.to_str())
292        .unwrap_or("");
293    let feedback = super::feedback::FeedbackStore::load();
294    if let Some(learned_entropy) = feedback.get_learned_entropy(ext) {
295        base.bpe_entropy = base.bpe_entropy * 0.6 + learned_entropy * 0.4;
296    }
297    if let Some(learned_jaccard) = feedback.get_learned_jaccard(ext) {
298        base.jaccard = base.jaccard * 0.6 + learned_jaccard * 0.4;
299    }
300
301    if content.len() > 500 {
302        let k = kolmogorov_proxy(content);
303        let k_adjustment = (k - 0.45) * 0.5;
304        base.bpe_entropy = (base.bpe_entropy + k_adjustment).clamp(0.4, 2.0);
305        base.jaccard = (base.jaccard - k_adjustment * 0.3).clamp(0.5, 0.85);
306    }
307
308    base
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// The Rust preset uses a lower entropy threshold than the Python preset.
    #[test]
    fn rust_has_lower_threshold_than_python() {
        let rs = thresholds_for_path("src/main.rs");
        let py = thresholds_for_path("src/main.py");
        assert!(rs.bpe_entropy < py.bpe_entropy);
    }

    /// JSON's entropy threshold is the minimum of the whole preset table.
    #[test]
    fn json_has_lowest_threshold() {
        let json = thresholds_for_path("config.json");
        let rs = thresholds_for_path("main.rs");
        assert!(json.bpe_entropy < rs.bpe_entropy);

        // "Lowest" has to hold against every row, not only Rust. Ties are
        // allowed because other data formats may share the same value.
        for (ext, preset) in LANGUAGE_THRESHOLDS.iter() {
            assert!(
                json.bpe_entropy <= preset.bpe_entropy,
                "json threshold {} exceeds {ext} threshold {}",
                json.bpe_entropy,
                preset.bpe_entropy
            );
        }
    }

    /// An extension without a table entry falls back to the default preset.
    #[test]
    fn unknown_ext_uses_default() {
        let t = thresholds_for_path("file.xyz");
        assert!((t.bpe_entropy - 1.0).abs() < f64::EPSILON);
    }

    /// Highly repetitive content ends up with a lower entropy threshold than
    /// varied content for the same path.
    #[test]
    fn adaptive_adjusts_for_compressibility() {
        // Both inputs exceed 500 bytes, so the content-based adjustment runs.
        let repetitive = "use std::io;\n".repeat(200);
        let diverse: String = (0..200)
            .map(|i| format!("let var_{i} = compute_{i}(arg_{i});\n"))
            .collect();

        let t_rep = adaptive_thresholds("main.rs", &repetitive);
        let t_div = adaptive_thresholds("main.rs", &diverse);
        assert!(
            t_rep.bpe_entropy < t_div.bpe_entropy,
            "repetitive content should get lower threshold: {} vs {}",
            t_rep.bpe_entropy,
            t_div.bpe_entropy
        );
    }
}
351}