1use std::collections::HashMap;
2use std::path::Path;
3
4use super::entropy::kolmogorov_proxy;
5
/// A set of three numeric thresholds selected per file type.
///
/// Per-extension presets are stored in `LANGUAGE_THRESHOLDS`; extensions
/// without a preset receive the values from the `Default` implementation.
///
/// `PartialEq` is derived so presets can be compared directly (for example
/// with `assert_eq!`); comparison is field-by-field on the `f64` values.
// NOTE(review): the exact meaning of each threshold is defined by the code
// that consumes this struct, which is outside this file — confirm there.
#[derive(Debug, Clone, PartialEq)]
pub struct CompressionThresholds {
    /// Entropy threshold (presumably over BPE tokens, judging by the name —
    /// TODO confirm). `adaptive_thresholds` keeps it within `0.4..=2.0`
    /// whenever it applies its content-based adjustment.
    pub bpe_entropy: f64,
    /// Jaccard threshold (presumably a similarity cut-off — TODO confirm).
    /// `adaptive_thresholds` keeps it within `0.5..=0.85` whenever it applies
    /// its content-based adjustment.
    pub jaccard: f64,
    /// Threshold named `auto_delta`; it is only stored here and is not read
    /// by any code visible in this file.
    pub auto_delta: f64,
}
12
13impl Default for CompressionThresholds {
14 fn default() -> Self {
15 Self {
16 bpe_entropy: 1.0,
17 jaccard: 0.7,
18 auto_delta: 0.6,
19 }
20 }
21}
22
23static LANGUAGE_THRESHOLDS: &[(&str, CompressionThresholds)] = &[
24 (
26 "py",
27 CompressionThresholds {
28 bpe_entropy: 1.2,
29 jaccard: 0.65,
30 auto_delta: 0.55,
31 },
32 ),
33 (
36 "rs",
37 CompressionThresholds {
38 bpe_entropy: 0.85,
39 jaccard: 0.55,
40 auto_delta: 0.6,
41 },
42 ),
43 (
45 "ts",
46 CompressionThresholds {
47 bpe_entropy: 0.95,
48 jaccard: 0.68,
49 auto_delta: 0.58,
50 },
51 ),
52 (
53 "tsx",
54 CompressionThresholds {
55 bpe_entropy: 0.95,
56 jaccard: 0.68,
57 auto_delta: 0.58,
58 },
59 ),
60 (
61 "js",
62 CompressionThresholds {
63 bpe_entropy: 1.0,
64 jaccard: 0.68,
65 auto_delta: 0.58,
66 },
67 ),
68 (
69 "jsx",
70 CompressionThresholds {
71 bpe_entropy: 1.0,
72 jaccard: 0.68,
73 auto_delta: 0.58,
74 },
75 ),
76 (
79 "go",
80 CompressionThresholds {
81 bpe_entropy: 0.9,
82 jaccard: 0.55,
83 auto_delta: 0.55,
84 },
85 ),
86 (
89 "java",
90 CompressionThresholds {
91 bpe_entropy: 0.8,
92 jaccard: 0.48,
93 auto_delta: 0.5,
94 },
95 ),
96 (
97 "kt",
98 CompressionThresholds {
99 bpe_entropy: 0.85,
100 jaccard: 0.68,
101 auto_delta: 0.55,
102 },
103 ),
104 (
106 "c",
107 CompressionThresholds {
108 bpe_entropy: 0.9,
109 jaccard: 0.7,
110 auto_delta: 0.6,
111 },
112 ),
113 (
114 "h",
115 CompressionThresholds {
116 bpe_entropy: 0.75,
117 jaccard: 0.65,
118 auto_delta: 0.5,
119 },
120 ),
121 (
122 "cpp",
123 CompressionThresholds {
124 bpe_entropy: 0.9,
125 jaccard: 0.7,
126 auto_delta: 0.6,
127 },
128 ),
129 (
130 "hpp",
131 CompressionThresholds {
132 bpe_entropy: 0.75,
133 jaccard: 0.65,
134 auto_delta: 0.5,
135 },
136 ),
137 (
139 "rb",
140 CompressionThresholds {
141 bpe_entropy: 1.15,
142 jaccard: 0.65,
143 auto_delta: 0.55,
144 },
145 ),
146 (
148 "json",
149 CompressionThresholds {
150 bpe_entropy: 0.6,
151 jaccard: 0.6,
152 auto_delta: 0.4,
153 },
154 ),
155 (
156 "yaml",
157 CompressionThresholds {
158 bpe_entropy: 0.7,
159 jaccard: 0.62,
160 auto_delta: 0.45,
161 },
162 ),
163 (
164 "yml",
165 CompressionThresholds {
166 bpe_entropy: 0.7,
167 jaccard: 0.62,
168 auto_delta: 0.45,
169 },
170 ),
171 (
172 "toml",
173 CompressionThresholds {
174 bpe_entropy: 0.7,
175 jaccard: 0.62,
176 auto_delta: 0.45,
177 },
178 ),
179 (
180 "xml",
181 CompressionThresholds {
182 bpe_entropy: 0.6,
183 jaccard: 0.6,
184 auto_delta: 0.4,
185 },
186 ),
187 (
189 "md",
190 CompressionThresholds {
191 bpe_entropy: 1.3,
192 jaccard: 0.6,
193 auto_delta: 0.55,
194 },
195 ),
196 (
198 "css",
199 CompressionThresholds {
200 bpe_entropy: 0.7,
201 jaccard: 0.6,
202 auto_delta: 0.45,
203 },
204 ),
205 (
206 "scss",
207 CompressionThresholds {
208 bpe_entropy: 0.75,
209 jaccard: 0.62,
210 auto_delta: 0.48,
211 },
212 ),
213 (
215 "sql",
216 CompressionThresholds {
217 bpe_entropy: 0.8,
218 jaccard: 0.65,
219 auto_delta: 0.5,
220 },
221 ),
222 (
224 "sh",
225 CompressionThresholds {
226 bpe_entropy: 1.0,
227 jaccard: 0.68,
228 auto_delta: 0.55,
229 },
230 ),
231 (
232 "bash",
233 CompressionThresholds {
234 bpe_entropy: 1.0,
235 jaccard: 0.68,
236 auto_delta: 0.55,
237 },
238 ),
239 (
241 "swift",
242 CompressionThresholds {
243 bpe_entropy: 0.9,
244 jaccard: 0.68,
245 auto_delta: 0.55,
246 },
247 ),
248 (
249 "cs",
250 CompressionThresholds {
251 bpe_entropy: 0.85,
252 jaccard: 0.65,
253 auto_delta: 0.52,
254 },
255 ),
256 (
258 "php",
259 CompressionThresholds {
260 bpe_entropy: 0.95,
261 jaccard: 0.68,
262 auto_delta: 0.55,
263 },
264 ),
265];
266
267fn language_map() -> HashMap<&'static str, &'static CompressionThresholds> {
268 LANGUAGE_THRESHOLDS
269 .iter()
270 .map(|(ext, t)| (*ext, t))
271 .collect()
272}
273
274pub fn thresholds_for_path(path: &str) -> CompressionThresholds {
275 let ext = Path::new(path)
276 .extension()
277 .and_then(|e| e.to_str())
278 .unwrap_or("");
279
280 let map = language_map();
281 if let Some(t) = map.get(ext) {
282 return (*t).clone();
283 }
284
285 CompressionThresholds::default()
286}
287
288pub fn adaptive_thresholds(path: &str, content: &str) -> CompressionThresholds {
289 let mut base = thresholds_for_path(path);
290
291 let ext = std::path::Path::new(path)
293 .extension()
294 .and_then(|e| e.to_str())
295 .unwrap_or("");
296 let feedback = super::feedback::FeedbackStore::load();
297 if let Some(learned_entropy) = feedback.get_learned_entropy(ext) {
298 base.bpe_entropy = base.bpe_entropy * 0.6 + learned_entropy * 0.4;
299 }
300 if let Some(learned_jaccard) = feedback.get_learned_jaccard(ext) {
301 base.jaccard = base.jaccard * 0.6 + learned_jaccard * 0.4;
302 }
303
304 if content.len() > 500 {
305 let k = kolmogorov_proxy(content);
306 let k_adjustment = (k - 0.45) * 0.5;
307 base.bpe_entropy = (base.bpe_entropy + k_adjustment).clamp(0.4, 2.0);
308 base.jaccard = (base.jaccard - k_adjustment * 0.3).clamp(0.5, 0.85);
309 }
310
311 base
312}
313
#[cfg(test)]
mod tests {
    use super::*;

    /// The Rust preset must have a lower `bpe_entropy` than the Python preset.
    #[test]
    fn rust_has_lower_threshold_than_python() {
        let rs = thresholds_for_path("src/main.rs");
        let py = thresholds_for_path("src/main.py");
        assert!(rs.bpe_entropy < py.bpe_entropy);
    }

    /// JSON must be strictly below Rust and no higher than any other preset,
    /// so the test actually checks "lowest" rather than a single comparison.
    #[test]
    fn json_has_lowest_threshold() {
        let json = thresholds_for_path("config.json");
        let rs = thresholds_for_path("main.rs");
        assert!(json.bpe_entropy < rs.bpe_entropy);

        for (ext, preset) in LANGUAGE_THRESHOLDS.iter() {
            assert!(
                json.bpe_entropy <= preset.bpe_entropy,
                "json bpe_entropy {} exceeds {} preset {}",
                json.bpe_entropy,
                ext,
                preset.bpe_entropy
            );
        }
    }

    /// An extension without a preset falls back to every default field.
    #[test]
    fn unknown_ext_uses_default() {
        let t = thresholds_for_path("file.xyz");
        let d = CompressionThresholds::default();
        assert!((t.bpe_entropy - 1.0).abs() < f64::EPSILON);
        assert!((t.bpe_entropy - d.bpe_entropy).abs() < f64::EPSILON);
        assert!((t.jaccard - d.jaccard).abs() < f64::EPSILON);
        assert!((t.auto_delta - d.auto_delta).abs() < f64::EPSILON);
    }

    /// A path with no extension at all also falls back to the defaults.
    #[test]
    fn missing_ext_uses_default() {
        let t = thresholds_for_path("Makefile");
        let d = CompressionThresholds::default();
        assert!((t.bpe_entropy - d.bpe_entropy).abs() < f64::EPSILON);
        assert!((t.jaccard - d.jaccard).abs() < f64::EPSILON);
        assert!((t.auto_delta - d.auto_delta).abs() < f64::EPSILON);
    }

    /// Highly repetitive content should end up with a lower `bpe_entropy`
    /// than varied content for the same file extension.
    #[test]
    fn adaptive_adjusts_for_compressibility() {
        let repetitive = "use std::io;\n".repeat(200);
        let diverse: String = (0..200)
            .map(|i| format!("let var_{i} = compute_{i}(arg_{i});\n"))
            .collect();

        let t_rep = adaptive_thresholds("main.rs", &repetitive);
        let t_div = adaptive_thresholds("main.rs", &diverse);
        assert!(
            t_rep.bpe_entropy < t_div.bpe_entropy,
            "repetitive content should get lower threshold: {} vs {}",
            t_rep.bpe_entropy,
            t_div.bpe_entropy
        );
    }
}