1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
/// Default cutoff, in bits of BPE token entropy per line: lines scoring
/// below this are treated as low-information boilerplate and removed.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
/// Outcome of one entropy-compression pass over a piece of text.
#[derive(Debug)]
pub struct EntropyResult {
    // The compressed text.
    pub output: String,
    // Token count of the input before compression.
    pub original_tokens: usize,
    // Token count of `output` after compression.
    pub compressed_tokens: usize,
    // Human-readable descriptions of the techniques that fired.
    pub techniques: Vec<String>,
}
19
20impl EntropyResult {
21 pub fn savings_percent(&self) -> f64 {
22 if self.original_tokens == 0 {
23 return 0.0;
24 }
25 let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
26 (saved as f64 / self.original_tokens as f64) * 100.0
27 }
28}
29
/// Character-level Shannon entropy of `text`, in bits per character.
///
/// Returns 0.0 for the empty string and for text made of one repeated
/// character.
pub fn shannon_entropy(text: &str) -> f64 {
    let mut counts: HashMap<char, usize> = HashMap::new();
    let mut total = 0usize;
    for ch in text.chars() {
        *counts.entry(ch).or_default() += 1;
        total += 1;
    }
    if total == 0 {
        return 0.0;
    }
    let n = total as f64;
    counts
        .values()
        .map(|&c| {
            let p = c as f64 / n;
            -p * p.log2()
        })
        .sum()
}
46
47pub fn token_entropy(text: &str) -> f64 {
50 let tokens = encode_tokens(text);
51 if tokens.is_empty() {
52 return 0.0;
53 }
54 let total = tokens.len();
55 let mut freq: HashMap<u32, usize> = HashMap::new();
56 for &t in &tokens {
57 *freq.entry(t).or_default() += 1;
58 }
59 freq.values().fold(0.0_f64, |acc, &count| {
60 let p = count as f64 / total as f64;
61 acc - p * p.log2()
62 })
63}
64
65pub fn normalized_token_entropy(text: &str) -> f64 {
69 let tokens = encode_tokens(text);
70 if tokens.is_empty() {
71 return 0.0;
72 }
73 let total = tokens.len();
74 let mut freq: HashMap<u32, usize> = HashMap::new();
75 for &t in &tokens {
76 *freq.entry(t).or_default() += 1;
77 }
78 let n_unique = freq.len();
79 if n_unique <= 1 {
80 return 0.0;
81 }
82 let h = freq.values().fold(0.0_f64, |acc, &count| {
83 let p = count as f64 / total as f64;
84 acc - p * p.log2()
85 });
86 let h_max = (n_unique as f64).log2();
87 h / h_max
88}
89
/// Jaccard similarity between the whitespace-token sets of `a` and `b`:
/// |A ∩ B| / |A ∪ B|, with 0.0 when both inputs have no tokens.
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let shared = words_a.intersection(&words_b).count();
    // |A ∪ B| = |A| + |B| − |A ∩ B| (inclusion–exclusion).
    let combined = words_a.len() + words_b.len() - shared;

    if combined == 0 {
        0.0
    } else {
        shared as f64 / combined as f64
    }
}
102
103pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
105 let set_a = ngram_set(a, n);
106 let set_b = ngram_set(b, n);
107
108 let intersection = set_a.intersection(&set_b).count();
109 let union = set_a.union(&set_b).count();
110
111 if union == 0 {
112 return 0.0;
113 }
114 intersection as f64 / union as f64
115}
116
/// Set of word `n`-grams of `text`, splitting on whitespace.
///
/// When the text has fewer than `n` words, the whole word sequence is
/// used as a single gram (and empty text yields an empty set), so very
/// short inputs still compare meaningfully.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let owned = |slice: &[&str]| -> Vec<String> { slice.iter().map(|w| (*w).to_string()).collect() };

    if words.len() >= n {
        words.windows(n).map(|w| owned(w)).collect()
    } else if words.is_empty() {
        HashSet::new()
    } else {
        std::iter::once(owned(&words)).collect()
    }
}
131
132pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
135 let ngrams = ngram_set(text, n);
136 if ngrams.is_empty() {
137 return vec![u64::MAX; k];
138 }
139 let mut signature = vec![u64::MAX; k];
140 for ngram in &ngrams {
141 for (i, min) in signature.iter_mut().enumerate() {
142 let h = hash_with_seed(ngram, i as u64);
143 if h < *min {
144 *min = h;
145 }
146 }
147 }
148 signature
149}
150
/// Fraction of positions at which two MinHash signatures agree — an
/// estimate of the Jaccard similarity of the underlying gram sets.
///
/// Returns 0.0 for empty signatures or mismatched lengths.
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut agree = 0usize;
    for (x, y) in sig_a.iter().zip(sig_b) {
        if x == y {
            agree += 1;
        }
    }
    agree as f64 / sig_a.len() as f64
}
163
/// Seeded 64-bit hash: folds `seed` into the hasher state before the
/// value, giving a cheap family of hash functions for MinHash.
/// Hashing the `(seed, value)` tuple feeds the hasher the same byte
/// sequence as hashing seed then value separately.
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    (seed, value).hash(&mut state);
    state.finish()
}
170
171pub fn kolmogorov_proxy(content: &str) -> f64 {
174 if content.is_empty() {
175 return 1.0;
176 }
177 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
178 encoder.write_all(content.as_bytes()).ok();
179 let compressed = encoder.finish().unwrap_or_default();
180 compressed.len() as f64 / content.len() as f64
181}
182
/// Coarse bucket for how compressible content is, based on its gzip
/// ratio (see `kolmogorov_proxy` / `compressibility_class`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    // Gzip ratio below 0.3 — highly redundant content.
    High,
    // Gzip ratio in [0.3, 0.6).
    Medium,
    // Gzip ratio 0.6 or above — mostly incompressible.
    Low,
}
189
190impl CompressibilityClass {
191 pub fn label(&self) -> &'static str {
192 match self {
193 Self::High => "high (K<0.3)",
194 Self::Medium => "medium (0.3≤K<0.6)",
195 Self::Low => "low (K≥0.6)",
196 }
197 }
198}
199
200pub fn compressibility_class(content: &str) -> CompressibilityClass {
202 let k = kolmogorov_proxy(content);
203 if k < 0.3 {
204 CompressibilityClass::High
205 } else if k < 0.6 {
206 CompressibilityClass::Medium
207 } else {
208 CompressibilityClass::Low
209 }
210}
211
/// Compress `content` with the default thresholds: lines with BPE
/// token entropy below `BPE_ENTROPY_THRESHOLD` are dropped, and blocks
/// with bigram Jaccard similarity ≥ 0.7 are deduplicated.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
215
216pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
217 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
218 let before_lines = content.lines().count() as u32;
219 let result =
220 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
221 let after_lines = result.output.lines().count() as u32;
222
223 if before_lines != after_lines {
224 super::events::emit(super::events::EventKind::Compression {
225 path: path.to_string(),
226 before_lines,
227 after_lines,
228 strategy: "entropy_adaptive".to_string(),
229 kept_line_count: after_lines,
230 removed_line_count: before_lines.saturating_sub(after_lines),
231 });
232 }
233
234 result
235}
236
237fn entropy_compress_with_thresholds(
238 content: &str,
239 entropy_threshold: f64,
240 jaccard_threshold: f64,
241) -> EntropyResult {
242 let original_tokens = count_tokens(content);
243 let mut lines: Vec<&str> = content.lines().collect();
244 let mut techniques = Vec::new();
245
246 let original_count = lines.len();
247 lines.retain(|line| {
248 let trimmed = line.trim();
249 super::surprise::should_keep_line(trimmed, entropy_threshold)
250 });
251 let removed = original_count - lines.len();
252 if removed > 0 {
253 techniques.push(format!(
254 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
255 ));
256 }
257
258 let blocks = extract_blocks(&lines);
259 let groups = find_pattern_groups(&blocks, jaccard_threshold);
260 let mut dedup_count = 0;
261 for group in &groups {
262 if group.len() > 1 {
263 dedup_count += group.len() - 1;
264 }
265 }
266 if dedup_count > 0 {
267 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
268 }
269
270 let mut result: Vec<String> = Vec::new();
271 let mut skip_indices: HashSet<usize> = HashSet::new();
272 for group in &groups {
273 if group.len() > 1 {
274 for &idx in &group[1..] {
275 skip_indices.insert(idx);
276 }
277 }
278 }
279 for (i, line) in lines.iter().enumerate() {
280 if !skip_indices.contains(&i) {
281 result.push(line.to_string());
282 }
283 }
284
285 let mut collapsed = Vec::new();
286 let mut blank_count = 0;
287 for line in &result {
288 if line.trim().is_empty() {
289 blank_count += 1;
290 if blank_count <= 1 {
291 collapsed.push(line.clone());
292 }
293 } else {
294 blank_count = 0;
295 collapsed.push(line.clone());
296 }
297 }
298
299 let output = collapsed.join("\n");
300 let compressed_tokens = count_tokens(&output);
301
302 EntropyResult {
303 output,
304 original_tokens,
305 compressed_tokens,
306 techniques,
307 }
308}
309
/// Per-line BPE entropy statistics for a piece of text.
#[derive(Debug)]
pub struct EntropyAnalysis {
    // Mean token entropy over non-blank lines (0.0 when there are none).
    pub avg_entropy: f64,
    // Non-blank lines with entropy below `BPE_ENTROPY_THRESHOLD`.
    pub low_entropy_count: usize,
    // Non-blank lines with entropy above 3.0 bits.
    pub high_entropy_count: usize,
    // Total line count, blank lines included.
    pub total_lines: usize,
}
317
318pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
319 let lines: Vec<&str> = content.lines().collect();
320 let total = lines.len();
321 let mut sum = 0.0;
322 let mut low = 0;
323 let mut high = 0;
324 let mut counted = 0;
325
326 for line in &lines {
327 let trimmed = line.trim();
328 if trimmed.is_empty() {
329 continue;
330 }
331 let h = token_entropy(trimmed);
332 sum += h;
333 counted += 1;
334 if h < BPE_ENTROPY_THRESHOLD {
335 low += 1;
336 }
337 if h > 3.0 {
338 high += 1;
339 }
340 }
341
342 EntropyAnalysis {
343 avg_entropy: if counted > 0 {
344 sum / counted as f64
345 } else {
346 0.0
347 },
348 low_entropy_count: low,
349 high_entropy_count: high,
350 total_lines: total,
351 }
352}
353
/// A run of contiguous non-blank lines, stored trimmed and joined with
/// newlines (see `extract_blocks`).
struct Block {
    content: String,
}
357
358fn extract_blocks(lines: &[&str]) -> Vec<Block> {
359 let mut blocks = Vec::new();
360 let mut current = String::new();
361
362 for line in lines.iter() {
363 let trimmed = line.trim();
364 if trimmed.is_empty() && !current.is_empty() {
365 blocks.push(Block {
366 content: current.clone(),
367 });
368 current.clear();
369 } else if !trimmed.is_empty() {
370 current.push_str(trimmed);
371 current.push('\n');
372 }
373 }
374
375 if !current.is_empty() {
376 blocks.push(Block { content: current });
377 }
378
379 blocks
380}
381
/// Greedy single-link grouping of blocks by bigram Jaccard similarity.
///
/// Each group is led by its lowest-index "representative" block,
/// followed by every later unassigned block whose similarity to the
/// representative is at least `threshold`. Singleton groups are kept,
/// so every block index appears exactly once across the output.
fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
    // Precompute each block's bigram set (and its size) once.
    let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
    let sizes: Vec<usize> = sets.iter().map(|s| s.len()).collect();

    let mut groups: Vec<Vec<usize>> = Vec::new();
    let mut assigned: HashSet<usize> = HashSet::new();

    for i in 0..blocks.len() {
        if assigned.contains(&i) {
            continue;
        }
        let mut group = vec![i];
        for j in (i + 1)..blocks.len() {
            if assigned.contains(&j) {
                continue;
            }
            let size_i = sizes[i];
            let size_j = sizes[j];
            let min_sz = size_i.min(size_j);
            let max_sz = size_i.max(size_j);
            // Size prefilter: J = |A∩B|/|A∪B| ≤ min/max, so when
            // min < threshold·max the pair can never reach the
            // threshold — skip the expensive set intersection.
            if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
                continue;
            }
            let inter = sets[i].intersection(&sets[j]).count();
            // |A ∪ B| via inclusion–exclusion.
            let union = size_i + size_j - inter;
            if union > 0 && (inter as f64 / union as f64) >= threshold {
                group.push(j);
                assigned.insert(j);
            }
        }
        // Marking i is cosmetic (the outer loop never revisits i);
        // kept for symmetry with the members added above.
        if group.len() > 1 {
            assigned.insert(i);
        }
        groups.push(group);
    }

    groups
}
423
#[cfg(test)]
mod tests {
    use super::*;

    // --- shannon_entropy: character-level entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        // One repeated symbol carries no information.
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- jaccard_similarity: word-set overlap ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        // Shares "hello" but not the second word.
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- entropy_compress pipeline and EntropyResult ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        // 100 -> 60 tokens is a 40% saving.
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Guard against division by zero when nothing was tokenized.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- token_entropy: BPE token-level entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        // BPE tokenization groups characters, so the two measures
        // should not coincide on real code.
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- ngram_jaccard: order-sensitive similarity ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- MinHash: sketch-based Jaccard approximation ---

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, 128);
        let sig_b = minhash_signature(b, 2, 128);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        // No n-grams: every slot stays at its u64::MAX sentinel.
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- kolmogorov_proxy and compressibility classes ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    // NOTE(review): near-duplicate of kolmogorov_diverse_is_higher
    // above (same property, different fixtures); consider consolidating.
    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}