1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12#[derive(Debug)]
14pub struct EntropyResult {
15 pub output: String,
16 pub original_tokens: usize,
17 pub compressed_tokens: usize,
18 pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22 pub fn savings_percent(&self) -> f64 {
24 if self.original_tokens == 0 {
25 return 0.0;
26 }
27 let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28 (saved as f64 / self.original_tokens as f64) * 100.0
29 }
30}
31
32pub fn shannon_entropy(text: &str) -> f64 {
34 if text.is_empty() {
35 return 0.0;
36 }
37 let mut freq: HashMap<char, usize> = HashMap::new();
38 let total = text.chars().count();
39
40 for c in text.chars() {
41 *freq.entry(c).or_default() += 1;
42 }
43
44 freq.values().fold(0.0_f64, |acc, &count| {
45 let p = count as f64 / total as f64;
46 acc - p * p.log2()
47 })
48}
49
50pub fn token_entropy_from_ids(tokens: &[u32]) -> f64 {
52 if tokens.is_empty() {
53 return 0.0;
54 }
55 let total = tokens.len();
56 let mut freq: HashMap<u32, usize> = HashMap::new();
57 for &t in tokens {
58 *freq.entry(t).or_default() += 1;
59 }
60 freq.values().fold(0.0_f64, |acc, &count| {
61 let p = count as f64 / total as f64;
62 acc - p * p.log2()
63 })
64}
65
66pub fn token_entropy(text: &str) -> f64 {
69 let tokens = encode_tokens(text);
70 token_entropy_from_ids(&tokens)
71}
72
73pub fn normalized_token_entropy_from_ids(tokens: &[u32]) -> f64 {
75 if tokens.is_empty() {
76 return 0.0;
77 }
78 let total = tokens.len();
79 let mut freq: HashMap<u32, usize> = HashMap::new();
80 for &t in tokens {
81 *freq.entry(t).or_default() += 1;
82 }
83 let n_unique = freq.len();
84 if n_unique <= 1 {
85 return 0.0;
86 }
87 let h = freq.values().fold(0.0_f64, |acc, &count| {
88 let p = count as f64 / total as f64;
89 acc - p * p.log2()
90 });
91 let h_max = (n_unique as f64).log2();
92 h / h_max
93}
94
95pub fn normalized_token_entropy(text: &str) -> f64 {
99 let tokens = encode_tokens(text);
100 normalized_token_entropy_from_ids(&tokens)
101}
102
103pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
105 let set_a: HashSet<&str> = a.split_whitespace().collect();
106 let set_b: HashSet<&str> = b.split_whitespace().collect();
107
108 let intersection = set_a.intersection(&set_b).count();
109 let union = set_a.union(&set_b).count();
110
111 if union == 0 {
112 return 0.0;
113 }
114 intersection as f64 / union as f64
115}
116
117pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
119 let set_a = ngram_set(a, n);
120 let set_b = ngram_set(b, n);
121
122 let intersection = set_a.intersection(&set_b).count();
123 let union = set_a.union(&set_b).count();
124
125 if union == 0 {
126 return 0.0;
127 }
128 intersection as f64 / union as f64
129}
130
131fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
132 let words: Vec<&str> = text.split_whitespace().collect();
133 if words.len() < n {
134 let mut set = HashSet::new();
135 if !words.is_empty() {
136 set.insert(words.iter().map(std::string::ToString::to_string).collect());
137 }
138 return set;
139 }
140 words
141 .windows(n)
142 .map(|w| w.iter().map(std::string::ToString::to_string).collect())
143 .collect()
144}
145
146pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
149 let ngrams = ngram_set(text, n);
150 if ngrams.is_empty() {
151 return vec![u64::MAX; k];
152 }
153 let mut signature = vec![u64::MAX; k];
154 for ngram in &ngrams {
155 for (i, min) in signature.iter_mut().enumerate() {
156 let h = hash_with_seed(ngram, i as u64);
157 if h < *min {
158 *min = h;
159 }
160 }
161 }
162 signature
163}
164
165pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
167 if sig_a.len() != sig_b.len() || sig_a.is_empty() {
168 return 0.0;
169 }
170 let matches = sig_a
171 .iter()
172 .zip(sig_b.iter())
173 .filter(|(a, b)| a == b)
174 .count();
175 matches as f64 / sig_a.len() as f64
176}
177
178fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
179 let mut hasher = std::collections::hash_map::DefaultHasher::new();
180 seed.hash(&mut hasher);
181 value.hash(&mut hasher);
182 hasher.finish()
183}
184
185pub fn kolmogorov_proxy(content: &str) -> f64 {
188 if content.is_empty() {
189 return 1.0;
190 }
191 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
192 encoder.write_all(content.as_bytes()).ok();
193 let compressed = encoder.finish().unwrap_or_default();
194 compressed.len() as f64 / content.len() as f64
195}
196
197#[derive(Debug, Clone, Copy, PartialEq, Eq)]
199pub enum CompressibilityClass {
200 High,
201 Medium,
202 Low,
203}
204
205impl CompressibilityClass {
206 pub fn label(&self) -> &'static str {
208 match self {
209 Self::High => "high (K<0.3)",
210 Self::Medium => "medium (0.3≤K<0.6)",
211 Self::Low => "low (K≥0.6)",
212 }
213 }
214}
215
216pub fn compressibility_class(content: &str) -> CompressibilityClass {
218 let k = kolmogorov_proxy(content);
219 if k < 0.3 {
220 CompressibilityClass::High
221 } else if k < 0.6 {
222 CompressibilityClass::Medium
223 } else {
224 CompressibilityClass::Low
225 }
226}
227
228pub fn entropy_compress(content: &str) -> EntropyResult {
230 entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
231}
232
233pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
235 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
236 let before_lines = content.lines().count() as u32;
237 let result =
238 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
239 let after_lines = result.output.lines().count() as u32;
240
241 if before_lines != after_lines {
242 super::events::emit(super::events::EventKind::Compression {
243 path: path.to_string(),
244 before_lines,
245 after_lines,
246 strategy: "entropy_adaptive".to_string(),
247 kept_line_count: after_lines,
248 removed_line_count: before_lines.saturating_sub(after_lines),
249 });
250 }
251
252 result
253}
254
255pub fn entropy_compress_task_conditioned(
261 content: &str,
262 path: &str,
263 task_keywords: &[String],
264) -> EntropyResult {
265 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
266 let before_lines = content.lines().count() as u32;
267 let result = entropy_compress_with_task(
268 content,
269 thresholds.bpe_entropy,
270 thresholds.jaccard,
271 task_keywords,
272 );
273 let after_lines = result.output.lines().count() as u32;
274 if before_lines != after_lines {
275 super::events::emit(super::events::EventKind::Compression {
276 path: path.to_string(),
277 before_lines,
278 after_lines,
279 strategy: "entropy_task_conditioned".to_string(),
280 kept_line_count: after_lines,
281 removed_line_count: before_lines.saturating_sub(after_lines),
282 });
283 }
284 result
285}
286
287fn entropy_compress_with_task(
288 content: &str,
289 entropy_threshold: f64,
290 jaccard_threshold: f64,
291 task_keywords: &[String],
292) -> EntropyResult {
293 let original_tokens = count_tokens(content);
294 let mut lines: Vec<&str> = content.lines().collect();
295 let mut techniques = Vec::new();
296
297 let kw_lower: Vec<String> = task_keywords.iter().map(|k| k.to_lowercase()).collect();
298 let original_count = lines.len();
299 let mut task_rescued = 0usize;
300 lines.retain(|line| {
301 let trimmed = line.trim();
302 if super::surprise::should_keep_line(trimmed, entropy_threshold) {
303 return true;
304 }
305 if !kw_lower.is_empty() {
307 let lower = trimmed.to_lowercase();
308 if kw_lower.iter().any(|kw| lower.contains(kw.as_str())) {
309 task_rescued += 1;
310 return true;
311 }
312 }
313 false
314 });
315 let removed = original_count - lines.len();
316 if removed > 0 || task_rescued > 0 {
317 let mut msg = format!("⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})");
318 if task_rescued > 0 {
319 msg.push_str(&format!(" [+{task_rescued} task-rescued]"));
320 }
321 techniques.push(msg);
322 }
323
324 let blocks = extract_blocks(&lines);
325 let groups = find_pattern_groups(&blocks, jaccard_threshold);
326 let mut dedup_count = 0;
327 for group in &groups {
328 if group.len() > 1 {
329 dedup_count += group.len() - 1;
330 }
331 }
332 if dedup_count > 0 {
333 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
334 }
335
336 let mut result: Vec<String> = Vec::new();
337 let mut skip_indices: std::collections::HashSet<usize> = std::collections::HashSet::new();
338 for group in &groups {
339 if group.len() > 1 {
340 for &idx in &group[1..] {
341 skip_indices.insert(idx);
342 }
343 }
344 }
345 for (i, line) in lines.iter().enumerate() {
346 if !skip_indices.contains(&i) {
347 result.push(line.to_string());
348 }
349 }
350
351 let mut collapsed = Vec::new();
352 let mut blank_count = 0;
353 for line in &result {
354 if line.trim().is_empty() {
355 blank_count += 1;
356 if blank_count <= 1 {
357 collapsed.push(line.clone());
358 }
359 } else {
360 blank_count = 0;
361 collapsed.push(line.clone());
362 }
363 }
364 let output = collapsed.join("\n");
365 let compressed_tokens = count_tokens(&output);
366
367 let final_output = if compressed_tokens > original_tokens {
371 content.to_string()
372 } else {
373 output
374 };
375 let final_tokens = if compressed_tokens > original_tokens {
376 original_tokens
377 } else {
378 compressed_tokens
379 };
380
381 EntropyResult {
382 output: final_output,
383 original_tokens,
384 compressed_tokens: final_tokens,
385 techniques,
386 }
387}
388
389fn entropy_compress_with_thresholds(
390 content: &str,
391 entropy_threshold: f64,
392 jaccard_threshold: f64,
393) -> EntropyResult {
394 entropy_compress_with_task(content, entropy_threshold, jaccard_threshold, &[])
395}
396
397#[derive(Debug)]
399pub struct EntropyAnalysis {
400 pub avg_entropy: f64,
401 pub low_entropy_count: usize,
402 pub high_entropy_count: usize,
403 pub total_lines: usize,
404}
405
406pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
408 let lines: Vec<&str> = content.lines().collect();
409 let total = lines.len();
410 let mut sum = 0.0;
411 let mut low = 0;
412 let mut high = 0;
413 let mut counted = 0;
414
415 for line in &lines {
416 let trimmed = line.trim();
417 if trimmed.is_empty() {
418 continue;
419 }
420 let h = token_entropy(trimmed);
421 sum += h;
422 counted += 1;
423 if h < BPE_ENTROPY_THRESHOLD {
424 low += 1;
425 }
426 if h > 3.0 {
427 high += 1;
428 }
429 }
430
431 EntropyAnalysis {
432 avg_entropy: if counted > 0 {
433 sum / counted as f64
434 } else {
435 0.0
436 },
437 low_entropy_count: low,
438 high_entropy_count: high,
439 total_lines: total,
440 }
441}
442
443struct Block {
444 content: String,
445}
446
447fn extract_blocks(lines: &[&str]) -> Vec<Block> {
448 let mut blocks = Vec::new();
449 let mut current = String::new();
450
451 for line in lines {
452 let trimmed = line.trim();
453 if trimmed.is_empty() && !current.is_empty() {
454 blocks.push(Block {
455 content: current.clone(),
456 });
457 current.clear();
458 } else if !trimmed.is_empty() {
459 current.push_str(trimmed);
460 current.push('\n');
461 }
462 }
463
464 if !current.is_empty() {
465 blocks.push(Block { content: current });
466 }
467
468 blocks
469}
470
471fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
472 let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
476 let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
477
478 let mut groups: Vec<Vec<usize>> = Vec::new();
479 let mut assigned: HashSet<usize> = HashSet::new();
480
481 for i in 0..blocks.len() {
482 if assigned.contains(&i) {
483 continue;
484 }
485 let mut group = vec![i];
486 for j in (i + 1)..blocks.len() {
487 if assigned.contains(&j) {
488 continue;
489 }
490 let size_i = sizes[i];
491 let size_j = sizes[j];
492 let min_sz = size_i.min(size_j);
493 let max_sz = size_i.max(size_j);
494 if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
495 continue;
496 }
497 let inter = sets[i].intersection(&sets[j]).count();
498 let union = size_i + size_j - inter;
499 if union > 0 && (inter as f64 / union as f64) >= threshold {
500 group.push(j);
501 assigned.insert(j);
502 }
503 }
504 if group.len() > 1 {
505 assigned.insert(i);
506 }
507 groups.push(group);
508 }
509
510 groups
511}
512
513#[cfg(test)]
514mod tests {
515 use super::*;
516
517 #[test]
518 fn shannon_entropy_empty_is_zero() {
519 assert_eq!(shannon_entropy(""), 0.0);
520 }
521
522 #[test]
523 fn shannon_entropy_single_char() {
524 assert_eq!(shannon_entropy("aaaa"), 0.0);
525 }
526
527 #[test]
528 fn shannon_entropy_high_for_varied_text() {
529 let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
530 let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
531 assert!(
532 shannon_entropy(varied) > shannon_entropy(uniform),
533 "varied text should have higher entropy"
534 );
535 }
536
537 #[test]
538 fn jaccard_identical_is_one() {
539 let sim = jaccard_similarity("hello world", "hello world");
540 assert!((sim - 1.0).abs() < f64::EPSILON);
541 }
542
543 #[test]
544 fn jaccard_disjoint_is_zero() {
545 let sim = jaccard_similarity("abc", "xyz");
546 assert_eq!(sim, 0.0);
547 }
548
549 #[test]
550 fn jaccard_partial_overlap() {
551 let sim = jaccard_similarity("hello world", "hello rust");
552 assert!(sim > 0.0 && sim < 1.0);
553 }
554
555 #[test]
556 fn entropy_compress_produces_output() {
557 let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
558 let result = entropy_compress(content);
559 assert!(!result.output.is_empty(), "should produce non-empty output");
560 assert!(result.compressed_tokens <= result.original_tokens);
561 }
562
563 #[test]
564 fn entropy_result_savings() {
565 let r = EntropyResult {
566 output: "short".to_string(),
567 original_tokens: 100,
568 compressed_tokens: 60,
569 techniques: vec!["test".to_string()],
570 };
571 assert!((r.savings_percent() - 40.0).abs() < 0.1);
572 }
573
574 #[test]
575 fn entropy_result_zero_original() {
576 let r = EntropyResult {
577 output: String::new(),
578 original_tokens: 0,
579 compressed_tokens: 0,
580 techniques: vec![],
581 };
582 assert_eq!(r.savings_percent(), 0.0);
583 }
584
585 #[test]
586 fn token_entropy_empty_is_zero() {
587 assert_eq!(token_entropy(""), 0.0);
588 }
589
590 #[test]
591 fn token_entropy_single_repeated_token() {
592 assert_eq!(token_entropy("}"), 0.0);
593 }
594
595 #[test]
596 fn token_entropy_higher_for_diverse_code() {
597 let diverse = "let result = compute_something(x, y, z);";
598 let repetitive = "aaaa aaaa aaaa aaaa";
599 assert!(
600 token_entropy(diverse) > token_entropy(repetitive),
601 "diverse code should have higher BPE token entropy"
602 );
603 }
604
605 #[test]
606 fn token_entropy_vs_char_entropy_differ() {
607 let code = "fn main() { println!(\"hello world\"); }";
608 let te = token_entropy(code);
609 let ce = shannon_entropy(code);
610 assert!(te != ce, "BPE and char entropy should differ for code");
611 }
612
613 #[test]
614 fn ngram_jaccard_preserves_order() {
615 let a = "a b c d";
616 let b = "d c b a";
617 let word_j = jaccard_similarity(a, b);
618 let ngram_j = ngram_jaccard(a, b, 2);
619 assert!(
620 ngram_j < word_j,
621 "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
622 );
623 }
624
625 #[test]
626 fn ngram_jaccard_identical_is_one() {
627 let text = "fn main() { println!(\"hello\"); }";
628 let j = ngram_jaccard(text, text, 2);
629 assert!((j - 1.0).abs() < f64::EPSILON);
630 }
631
632 #[test]
633 fn ngram_jaccard_disjoint_is_zero() {
634 let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
635 assert_eq!(j, 0.0);
636 }
637
638 #[test]
639 fn minhash_approximates_jaccard() {
640 let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
641 let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
642 let exact = ngram_jaccard(a, b, 2);
643 let sig_a = minhash_signature(a, 2, 128);
644 let sig_b = minhash_signature(b, 2, 128);
645 let approx = minhash_similarity(&sig_a, &sig_b);
646 assert!(
647 (exact - approx).abs() < 0.2,
648 "minhash ({approx}) should approximate exact ({exact}) within 0.2"
649 );
650 }
651
652 #[test]
653 fn minhash_empty_text() {
654 let sig = minhash_signature("", 2, 64);
655 assert!(sig.iter().all(|&v| v == u64::MAX));
656 }
657
658 #[test]
659 fn kolmogorov_empty_is_one() {
660 assert_eq!(kolmogorov_proxy(""), 1.0);
661 }
662
663 #[test]
664 fn kolmogorov_repetitive_is_low() {
665 let repetitive = "aaa\n".repeat(1000);
666 let k = kolmogorov_proxy(&repetitive);
667 assert!(
668 k < 0.1,
669 "highly repetitive text should compress well: K={k}"
670 );
671 }
672
673 #[test]
674 fn kolmogorov_diverse_is_higher() {
675 let repetitive = "aaa\n".repeat(500);
676 let diverse = (0..500)
677 .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
678 .collect::<Vec<_>>()
679 .join("\n");
680 assert!(
681 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
682 "diverse content should have higher K than repetitive"
683 );
684 }
685
686 #[test]
687 fn compressibility_class_repetitive_is_high() {
688 let text = "use std::io;\n".repeat(200);
689 assert_eq!(compressibility_class(&text), CompressibilityClass::High);
690 }
691
692 #[test]
693 fn kolmogorov_diverse_higher_than_repetitive() {
694 let rep = "test\n".repeat(500);
695 let diverse = (0..500)
696 .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
697 .collect::<Vec<_>>()
698 .join("\n");
699 assert!(
700 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
701 "diverse content should have higher K"
702 );
703 }
704}