1use std::collections::HashMap;
2use std::path::Path;
3
4use super::entropy::kolmogorov_proxy;
5
/// Set of numeric cut-offs consulted when deciding how to compress a file.
///
/// Static presets per file extension live in `LANGUAGE_THRESHOLDS`; extensions without a
/// preset use the [`Default`] implementation. Values compare with `==` (plain `f64`
/// comparison per field, so a `NaN` field never compares equal).
#[derive(Debug, Clone, PartialEq)]
pub struct CompressionThresholds {
    /// Entropy cut-off. The name suggests it applies to BPE-token entropy — confirm against
    /// the code that consumes this field.
    pub bpe_entropy: f64,
    /// Similarity cut-off, presumably a Jaccard index in `0.0..=1.0` — confirm against the
    /// consumer.
    pub jaccard: f64,
    /// Cut-off for automatically choosing a delta-style output (assumed from the name; the
    /// consumer is not visible in this file).
    pub auto_delta: f64,
}
12
13impl Default for CompressionThresholds {
14 fn default() -> Self {
15 Self {
16 bpe_entropy: 1.0,
17 jaccard: 0.7,
18 auto_delta: 0.6,
19 }
20 }
21}
22
23static LANGUAGE_THRESHOLDS: &[(&str, CompressionThresholds)] = &[
24 (
26 "py",
27 CompressionThresholds {
28 bpe_entropy: 1.2,
29 jaccard: 0.65,
30 auto_delta: 0.55,
31 },
32 ),
33 (
35 "rs",
36 CompressionThresholds {
37 bpe_entropy: 0.85,
38 jaccard: 0.72,
39 auto_delta: 0.6,
40 },
41 ),
42 (
44 "ts",
45 CompressionThresholds {
46 bpe_entropy: 0.95,
47 jaccard: 0.68,
48 auto_delta: 0.58,
49 },
50 ),
51 (
52 "tsx",
53 CompressionThresholds {
54 bpe_entropy: 0.95,
55 jaccard: 0.68,
56 auto_delta: 0.58,
57 },
58 ),
59 (
60 "js",
61 CompressionThresholds {
62 bpe_entropy: 1.0,
63 jaccard: 0.68,
64 auto_delta: 0.58,
65 },
66 ),
67 (
68 "jsx",
69 CompressionThresholds {
70 bpe_entropy: 1.0,
71 jaccard: 0.68,
72 auto_delta: 0.58,
73 },
74 ),
75 (
77 "go",
78 CompressionThresholds {
79 bpe_entropy: 0.9,
80 jaccard: 0.72,
81 auto_delta: 0.55,
82 },
83 ),
84 (
86 "java",
87 CompressionThresholds {
88 bpe_entropy: 0.8,
89 jaccard: 0.65,
90 auto_delta: 0.5,
91 },
92 ),
93 (
94 "kt",
95 CompressionThresholds {
96 bpe_entropy: 0.85,
97 jaccard: 0.68,
98 auto_delta: 0.55,
99 },
100 ),
101 (
103 "c",
104 CompressionThresholds {
105 bpe_entropy: 0.9,
106 jaccard: 0.7,
107 auto_delta: 0.6,
108 },
109 ),
110 (
111 "h",
112 CompressionThresholds {
113 bpe_entropy: 0.75,
114 jaccard: 0.65,
115 auto_delta: 0.5,
116 },
117 ),
118 (
119 "cpp",
120 CompressionThresholds {
121 bpe_entropy: 0.9,
122 jaccard: 0.7,
123 auto_delta: 0.6,
124 },
125 ),
126 (
127 "hpp",
128 CompressionThresholds {
129 bpe_entropy: 0.75,
130 jaccard: 0.65,
131 auto_delta: 0.5,
132 },
133 ),
134 (
136 "rb",
137 CompressionThresholds {
138 bpe_entropy: 1.15,
139 jaccard: 0.65,
140 auto_delta: 0.55,
141 },
142 ),
143 (
145 "json",
146 CompressionThresholds {
147 bpe_entropy: 0.6,
148 jaccard: 0.6,
149 auto_delta: 0.4,
150 },
151 ),
152 (
153 "yaml",
154 CompressionThresholds {
155 bpe_entropy: 0.7,
156 jaccard: 0.62,
157 auto_delta: 0.45,
158 },
159 ),
160 (
161 "yml",
162 CompressionThresholds {
163 bpe_entropy: 0.7,
164 jaccard: 0.62,
165 auto_delta: 0.45,
166 },
167 ),
168 (
169 "toml",
170 CompressionThresholds {
171 bpe_entropy: 0.7,
172 jaccard: 0.62,
173 auto_delta: 0.45,
174 },
175 ),
176 (
177 "xml",
178 CompressionThresholds {
179 bpe_entropy: 0.6,
180 jaccard: 0.6,
181 auto_delta: 0.4,
182 },
183 ),
184 (
186 "md",
187 CompressionThresholds {
188 bpe_entropy: 1.3,
189 jaccard: 0.6,
190 auto_delta: 0.55,
191 },
192 ),
193 (
195 "css",
196 CompressionThresholds {
197 bpe_entropy: 0.7,
198 jaccard: 0.6,
199 auto_delta: 0.45,
200 },
201 ),
202 (
203 "scss",
204 CompressionThresholds {
205 bpe_entropy: 0.75,
206 jaccard: 0.62,
207 auto_delta: 0.48,
208 },
209 ),
210 (
212 "sql",
213 CompressionThresholds {
214 bpe_entropy: 0.8,
215 jaccard: 0.65,
216 auto_delta: 0.5,
217 },
218 ),
219 (
221 "sh",
222 CompressionThresholds {
223 bpe_entropy: 1.0,
224 jaccard: 0.68,
225 auto_delta: 0.55,
226 },
227 ),
228 (
229 "bash",
230 CompressionThresholds {
231 bpe_entropy: 1.0,
232 jaccard: 0.68,
233 auto_delta: 0.55,
234 },
235 ),
236 (
238 "swift",
239 CompressionThresholds {
240 bpe_entropy: 0.9,
241 jaccard: 0.68,
242 auto_delta: 0.55,
243 },
244 ),
245 (
246 "cs",
247 CompressionThresholds {
248 bpe_entropy: 0.85,
249 jaccard: 0.65,
250 auto_delta: 0.52,
251 },
252 ),
253 (
255 "php",
256 CompressionThresholds {
257 bpe_entropy: 0.95,
258 jaccard: 0.68,
259 auto_delta: 0.55,
260 },
261 ),
262];
263
264fn language_map() -> HashMap<&'static str, &'static CompressionThresholds> {
265 LANGUAGE_THRESHOLDS
266 .iter()
267 .map(|(ext, t)| (*ext, t))
268 .collect()
269}
270
271pub fn thresholds_for_path(path: &str) -> CompressionThresholds {
272 let ext = Path::new(path)
273 .extension()
274 .and_then(|e| e.to_str())
275 .unwrap_or("");
276
277 let map = language_map();
278 if let Some(t) = map.get(ext) {
279 return (*t).clone();
280 }
281
282 CompressionThresholds::default()
283}
284
285pub fn adaptive_thresholds(path: &str, content: &str) -> CompressionThresholds {
286 let mut base = thresholds_for_path(path);
287
288 let ext = std::path::Path::new(path)
290 .extension()
291 .and_then(|e| e.to_str())
292 .unwrap_or("");
293 let feedback = super::feedback::FeedbackStore::load();
294 if let Some(learned_entropy) = feedback.get_learned_entropy(ext) {
295 base.bpe_entropy = base.bpe_entropy * 0.6 + learned_entropy * 0.4;
296 }
297 if let Some(learned_jaccard) = feedback.get_learned_jaccard(ext) {
298 base.jaccard = base.jaccard * 0.6 + learned_jaccard * 0.4;
299 }
300
301 if content.len() > 500 {
302 let k = kolmogorov_proxy(content);
303 let k_adjustment = (k - 0.45) * 0.5;
304 base.bpe_entropy = (base.bpe_entropy + k_adjustment).clamp(0.4, 2.0);
305 base.jaccard = (base.jaccard - k_adjustment * 0.3).clamp(0.5, 0.85);
306 }
307
308 base
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// The Rust preset must be stricter (lower entropy cut-off) than the Python preset.
    #[test]
    fn rust_has_lower_threshold_than_python() {
        let rs = thresholds_for_path("src/main.rs");
        let py = thresholds_for_path("src/main.py");
        assert!(rs.bpe_entropy < py.bpe_entropy);
    }

    /// JSON must have the lowest entropy cut-off of every preset, not merely lower than Rust.
    #[test]
    fn json_has_lowest_threshold() {
        let json = thresholds_for_path("config.json");
        let rs = thresholds_for_path("main.rs");
        assert!(json.bpe_entropy < rs.bpe_entropy);

        // `<=` because other data formats may legitimately share the same minimum.
        for (ext, entry) in LANGUAGE_THRESHOLDS.iter() {
            assert!(
                json.bpe_entropy <= entry.bpe_entropy,
                "json ({}) should not exceed {} ({})",
                json.bpe_entropy,
                ext,
                entry.bpe_entropy
            );
        }
    }

    /// Unknown, missing and empty extensions all resolve to the `Default` thresholds.
    #[test]
    fn unknown_ext_uses_default() {
        let t = thresholds_for_path("file.xyz");
        assert!((t.bpe_entropy - 1.0).abs() < f64::EPSILON);

        let fallback = CompressionThresholds::default();
        for path in ["file.xyz", "Makefile", ""] {
            let t = thresholds_for_path(path);
            assert!(
                (t.bpe_entropy - fallback.bpe_entropy).abs() < f64::EPSILON,
                "bpe_entropy for {path:?}"
            );
            assert!(
                (t.jaccard - fallback.jaccard).abs() < f64::EPSILON,
                "jaccard for {path:?}"
            );
            assert!(
                (t.auto_delta - fallback.auto_delta).abs() < f64::EPSILON,
                "auto_delta for {path:?}"
            );
        }
    }

    /// Highly repetitive content must end up with a lower entropy cut-off than varied content.
    #[test]
    fn adaptive_adjusts_for_compressibility() {
        let repetitive = "use std::io;\n".repeat(200);
        let diverse: String = (0..200)
            .map(|i| format!("let var_{i} = compute_{i}(arg_{i});\n"))
            .collect();

        let t_rep = adaptive_thresholds("main.rs", &repetitive);
        let t_div = adaptive_thresholds("main.rs", &diverse);
        assert!(
            t_rep.bpe_entropy < t_div.bpe_entropy,
            "repetitive content should get lower threshold: {} vs {}",
            t_rep.bpe_entropy,
            t_div.bpe_entropy
        );
    }
}