1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12#[derive(Debug)]
14pub struct EntropyResult {
15 pub output: String,
16 pub original_tokens: usize,
17 pub compressed_tokens: usize,
18 pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22 pub fn savings_percent(&self) -> f64 {
24 if self.original_tokens == 0 {
25 return 0.0;
26 }
27 let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28 (saved as f64 / self.original_tokens as f64) * 100.0
29 }
30}
31
32pub fn shannon_entropy(text: &str) -> f64 {
34 if text.is_empty() {
35 return 0.0;
36 }
37 let mut freq: HashMap<char, usize> = HashMap::new();
38 let total = text.chars().count();
39
40 for c in text.chars() {
41 *freq.entry(c).or_default() += 1;
42 }
43
44 freq.values().fold(0.0_f64, |acc, &count| {
45 let p = count as f64 / total as f64;
46 acc - p * p.log2()
47 })
48}
49
50pub fn token_entropy_from_ids(tokens: &[u32]) -> f64 {
52 if tokens.is_empty() {
53 return 0.0;
54 }
55 let total = tokens.len();
56 let mut freq: HashMap<u32, usize> = HashMap::new();
57 for &t in tokens {
58 *freq.entry(t).or_default() += 1;
59 }
60 freq.values().fold(0.0_f64, |acc, &count| {
61 let p = count as f64 / total as f64;
62 acc - p * p.log2()
63 })
64}
65
66pub fn token_entropy(text: &str) -> f64 {
69 let tokens = encode_tokens(text);
70 token_entropy_from_ids(&tokens)
71}
72
73pub fn normalized_token_entropy_from_ids(tokens: &[u32]) -> f64 {
75 if tokens.is_empty() {
76 return 0.0;
77 }
78 let total = tokens.len();
79 let mut freq: HashMap<u32, usize> = HashMap::new();
80 for &t in tokens {
81 *freq.entry(t).or_default() += 1;
82 }
83 let n_unique = freq.len();
84 if n_unique <= 1 {
85 return 0.0;
86 }
87 let h = freq.values().fold(0.0_f64, |acc, &count| {
88 let p = count as f64 / total as f64;
89 acc - p * p.log2()
90 });
91 let h_max = (n_unique as f64).log2();
92 h / h_max
93}
94
95pub fn normalized_token_entropy(text: &str) -> f64 {
99 let tokens = encode_tokens(text);
100 normalized_token_entropy_from_ids(&tokens)
101}
102
103pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
105 let set_a: HashSet<&str> = a.split_whitespace().collect();
106 let set_b: HashSet<&str> = b.split_whitespace().collect();
107
108 let intersection = set_a.intersection(&set_b).count();
109 let union = set_a.union(&set_b).count();
110
111 if union == 0 {
112 return 0.0;
113 }
114 intersection as f64 / union as f64
115}
116
117pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
119 let set_a = ngram_set(a, n);
120 let set_b = ngram_set(b, n);
121
122 let intersection = set_a.intersection(&set_b).count();
123 let union = set_a.union(&set_b).count();
124
125 if union == 0 {
126 return 0.0;
127 }
128 intersection as f64 / union as f64
129}
130
131fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
132 let words: Vec<&str> = text.split_whitespace().collect();
133 if words.len() < n {
134 let mut set = HashSet::new();
135 if !words.is_empty() {
136 set.insert(words.iter().map(std::string::ToString::to_string).collect());
137 }
138 return set;
139 }
140 words
141 .windows(n)
142 .map(|w| w.iter().map(std::string::ToString::to_string).collect())
143 .collect()
144}
145
146pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
149 let ngrams = ngram_set(text, n);
150 if ngrams.is_empty() {
151 return vec![u64::MAX; k];
152 }
153 let mut signature = vec![u64::MAX; k];
154 for ngram in &ngrams {
155 for (i, min) in signature.iter_mut().enumerate() {
156 let h = hash_with_seed(ngram, i as u64);
157 if h < *min {
158 *min = h;
159 }
160 }
161 }
162 signature
163}
164
165pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
167 if sig_a.len() != sig_b.len() || sig_a.is_empty() {
168 return 0.0;
169 }
170 let matches = sig_a
171 .iter()
172 .zip(sig_b.iter())
173 .filter(|(a, b)| a == b)
174 .count();
175 matches as f64 / sig_a.len() as f64
176}
177
178fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
179 let mut hasher = std::collections::hash_map::DefaultHasher::new();
180 seed.hash(&mut hasher);
181 value.hash(&mut hasher);
182 hasher.finish()
183}
184
185pub fn kolmogorov_proxy(content: &str) -> f64 {
188 if content.is_empty() {
189 return 1.0;
190 }
191 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
192 encoder.write_all(content.as_bytes()).ok();
193 let compressed = encoder.finish().unwrap_or_default();
194 compressed.len() as f64 / content.len() as f64
195}
196
197#[derive(Debug, Clone, Copy, PartialEq, Eq)]
199pub enum CompressibilityClass {
200 High,
201 Medium,
202 Low,
203}
204
205impl CompressibilityClass {
206 pub fn label(&self) -> &'static str {
208 match self {
209 Self::High => "high (K<0.3)",
210 Self::Medium => "medium (0.3≤K<0.6)",
211 Self::Low => "low (K≥0.6)",
212 }
213 }
214}
215
216pub fn compressibility_class(content: &str) -> CompressibilityClass {
218 let k = kolmogorov_proxy(content);
219 if k < 0.3 {
220 CompressibilityClass::High
221 } else if k < 0.6 {
222 CompressibilityClass::Medium
223 } else {
224 CompressibilityClass::Low
225 }
226}
227
228pub fn entropy_compress(content: &str) -> EntropyResult {
230 entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
231}
232
233pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
235 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
236 let before_lines = content.lines().count() as u32;
237 let result =
238 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
239 let after_lines = result.output.lines().count() as u32;
240
241 if before_lines != after_lines {
242 super::events::emit(super::events::EventKind::Compression {
243 path: path.to_string(),
244 before_lines,
245 after_lines,
246 strategy: "entropy_adaptive".to_string(),
247 kept_line_count: after_lines,
248 removed_line_count: before_lines.saturating_sub(after_lines),
249 });
250 }
251
252 result
253}
254
255fn entropy_compress_with_thresholds(
256 content: &str,
257 entropy_threshold: f64,
258 jaccard_threshold: f64,
259) -> EntropyResult {
260 let original_tokens = count_tokens(content);
261 let mut lines: Vec<&str> = content.lines().collect();
262 let mut techniques = Vec::new();
263
264 let original_count = lines.len();
265 lines.retain(|line| {
266 let trimmed = line.trim();
267 super::surprise::should_keep_line(trimmed, entropy_threshold)
268 });
269 let removed = original_count - lines.len();
270 if removed > 0 {
271 techniques.push(format!(
272 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
273 ));
274 }
275
276 let blocks = extract_blocks(&lines);
277 let groups = find_pattern_groups(&blocks, jaccard_threshold);
278 let mut dedup_count = 0;
279 for group in &groups {
280 if group.len() > 1 {
281 dedup_count += group.len() - 1;
282 }
283 }
284 if dedup_count > 0 {
285 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
286 }
287
288 let mut result: Vec<String> = Vec::new();
289 let mut skip_indices: HashSet<usize> = HashSet::new();
290 for group in &groups {
291 if group.len() > 1 {
292 for &idx in &group[1..] {
293 skip_indices.insert(idx);
294 }
295 }
296 }
297 for (i, line) in lines.iter().enumerate() {
298 if !skip_indices.contains(&i) {
299 result.push(line.to_string());
300 }
301 }
302
303 let mut collapsed = Vec::new();
304 let mut blank_count = 0;
305 for line in &result {
306 if line.trim().is_empty() {
307 blank_count += 1;
308 if blank_count <= 1 {
309 collapsed.push(line.clone());
310 }
311 } else {
312 blank_count = 0;
313 collapsed.push(line.clone());
314 }
315 }
316
317 let output = collapsed.join("\n");
318 let compressed_tokens = count_tokens(&output);
319
320 let final_output = if compressed_tokens > original_tokens {
321 content.to_string()
322 } else {
323 output
324 };
325 let final_tokens = if compressed_tokens > original_tokens {
326 original_tokens
327 } else {
328 compressed_tokens
329 };
330
331 EntropyResult {
332 output: final_output,
333 original_tokens,
334 compressed_tokens: final_tokens,
335 techniques,
336 }
337}
338
339#[derive(Debug)]
341pub struct EntropyAnalysis {
342 pub avg_entropy: f64,
343 pub low_entropy_count: usize,
344 pub high_entropy_count: usize,
345 pub total_lines: usize,
346}
347
348pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
350 let lines: Vec<&str> = content.lines().collect();
351 let total = lines.len();
352 let mut sum = 0.0;
353 let mut low = 0;
354 let mut high = 0;
355 let mut counted = 0;
356
357 for line in &lines {
358 let trimmed = line.trim();
359 if trimmed.is_empty() {
360 continue;
361 }
362 let h = token_entropy(trimmed);
363 sum += h;
364 counted += 1;
365 if h < BPE_ENTROPY_THRESHOLD {
366 low += 1;
367 }
368 if h > 3.0 {
369 high += 1;
370 }
371 }
372
373 EntropyAnalysis {
374 avg_entropy: if counted > 0 {
375 sum / counted as f64
376 } else {
377 0.0
378 },
379 low_entropy_count: low,
380 high_entropy_count: high,
381 total_lines: total,
382 }
383}
384
385struct Block {
386 content: String,
387}
388
389fn extract_blocks(lines: &[&str]) -> Vec<Block> {
390 let mut blocks = Vec::new();
391 let mut current = String::new();
392
393 for line in lines {
394 let trimmed = line.trim();
395 if trimmed.is_empty() && !current.is_empty() {
396 blocks.push(Block {
397 content: current.clone(),
398 });
399 current.clear();
400 } else if !trimmed.is_empty() {
401 current.push_str(trimmed);
402 current.push('\n');
403 }
404 }
405
406 if !current.is_empty() {
407 blocks.push(Block { content: current });
408 }
409
410 blocks
411}
412
413fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
414 let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
418 let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
419
420 let mut groups: Vec<Vec<usize>> = Vec::new();
421 let mut assigned: HashSet<usize> = HashSet::new();
422
423 for i in 0..blocks.len() {
424 if assigned.contains(&i) {
425 continue;
426 }
427 let mut group = vec![i];
428 for j in (i + 1)..blocks.len() {
429 if assigned.contains(&j) {
430 continue;
431 }
432 let size_i = sizes[i];
433 let size_j = sizes[j];
434 let min_sz = size_i.min(size_j);
435 let max_sz = size_i.max(size_j);
436 if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
437 continue;
438 }
439 let inter = sets[i].intersection(&sets[j]).count();
440 let union = size_i + size_j - inter;
441 if union > 0 && (inter as f64 / union as f64) >= threshold {
442 group.push(j);
443 assigned.insert(j);
444 }
445 }
446 if group.len() > 1 {
447 assigned.insert(i);
448 }
449 groups.push(group);
450 }
451
452 groups
453}
454
455#[cfg(test)]
456mod tests {
457 use super::*;
458
459 #[test]
460 fn shannon_entropy_empty_is_zero() {
461 assert_eq!(shannon_entropy(""), 0.0);
462 }
463
464 #[test]
465 fn shannon_entropy_single_char() {
466 assert_eq!(shannon_entropy("aaaa"), 0.0);
467 }
468
469 #[test]
470 fn shannon_entropy_high_for_varied_text() {
471 let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
472 let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
473 assert!(
474 shannon_entropy(varied) > shannon_entropy(uniform),
475 "varied text should have higher entropy"
476 );
477 }
478
479 #[test]
480 fn jaccard_identical_is_one() {
481 let sim = jaccard_similarity("hello world", "hello world");
482 assert!((sim - 1.0).abs() < f64::EPSILON);
483 }
484
485 #[test]
486 fn jaccard_disjoint_is_zero() {
487 let sim = jaccard_similarity("abc", "xyz");
488 assert_eq!(sim, 0.0);
489 }
490
491 #[test]
492 fn jaccard_partial_overlap() {
493 let sim = jaccard_similarity("hello world", "hello rust");
494 assert!(sim > 0.0 && sim < 1.0);
495 }
496
497 #[test]
498 fn entropy_compress_produces_output() {
499 let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
500 let result = entropy_compress(content);
501 assert!(!result.output.is_empty(), "should produce non-empty output");
502 assert!(result.compressed_tokens <= result.original_tokens);
503 }
504
505 #[test]
506 fn entropy_result_savings() {
507 let r = EntropyResult {
508 output: "short".to_string(),
509 original_tokens: 100,
510 compressed_tokens: 60,
511 techniques: vec!["test".to_string()],
512 };
513 assert!((r.savings_percent() - 40.0).abs() < 0.1);
514 }
515
516 #[test]
517 fn entropy_result_zero_original() {
518 let r = EntropyResult {
519 output: String::new(),
520 original_tokens: 0,
521 compressed_tokens: 0,
522 techniques: vec![],
523 };
524 assert_eq!(r.savings_percent(), 0.0);
525 }
526
527 #[test]
528 fn token_entropy_empty_is_zero() {
529 assert_eq!(token_entropy(""), 0.0);
530 }
531
532 #[test]
533 fn token_entropy_single_repeated_token() {
534 assert_eq!(token_entropy("}"), 0.0);
535 }
536
537 #[test]
538 fn token_entropy_higher_for_diverse_code() {
539 let diverse = "let result = compute_something(x, y, z);";
540 let repetitive = "aaaa aaaa aaaa aaaa";
541 assert!(
542 token_entropy(diverse) > token_entropy(repetitive),
543 "diverse code should have higher BPE token entropy"
544 );
545 }
546
547 #[test]
548 fn token_entropy_vs_char_entropy_differ() {
549 let code = "fn main() { println!(\"hello world\"); }";
550 let te = token_entropy(code);
551 let ce = shannon_entropy(code);
552 assert!(te != ce, "BPE and char entropy should differ for code");
553 }
554
555 #[test]
556 fn ngram_jaccard_preserves_order() {
557 let a = "a b c d";
558 let b = "d c b a";
559 let word_j = jaccard_similarity(a, b);
560 let ngram_j = ngram_jaccard(a, b, 2);
561 assert!(
562 ngram_j < word_j,
563 "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
564 );
565 }
566
567 #[test]
568 fn ngram_jaccard_identical_is_one() {
569 let text = "fn main() { println!(\"hello\"); }";
570 let j = ngram_jaccard(text, text, 2);
571 assert!((j - 1.0).abs() < f64::EPSILON);
572 }
573
574 #[test]
575 fn ngram_jaccard_disjoint_is_zero() {
576 let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
577 assert_eq!(j, 0.0);
578 }
579
580 #[test]
581 fn minhash_approximates_jaccard() {
582 let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
583 let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
584 let exact = ngram_jaccard(a, b, 2);
585 let sig_a = minhash_signature(a, 2, 128);
586 let sig_b = minhash_signature(b, 2, 128);
587 let approx = minhash_similarity(&sig_a, &sig_b);
588 assert!(
589 (exact - approx).abs() < 0.2,
590 "minhash ({approx}) should approximate exact ({exact}) within 0.2"
591 );
592 }
593
594 #[test]
595 fn minhash_empty_text() {
596 let sig = minhash_signature("", 2, 64);
597 assert!(sig.iter().all(|&v| v == u64::MAX));
598 }
599
600 #[test]
601 fn kolmogorov_empty_is_one() {
602 assert_eq!(kolmogorov_proxy(""), 1.0);
603 }
604
605 #[test]
606 fn kolmogorov_repetitive_is_low() {
607 let repetitive = "aaa\n".repeat(1000);
608 let k = kolmogorov_proxy(&repetitive);
609 assert!(
610 k < 0.1,
611 "highly repetitive text should compress well: K={k}"
612 );
613 }
614
615 #[test]
616 fn kolmogorov_diverse_is_higher() {
617 let repetitive = "aaa\n".repeat(500);
618 let diverse = (0..500)
619 .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
620 .collect::<Vec<_>>()
621 .join("\n");
622 assert!(
623 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
624 "diverse content should have higher K than repetitive"
625 );
626 }
627
628 #[test]
629 fn compressibility_class_repetitive_is_high() {
630 let text = "use std::io;\n".repeat(200);
631 assert_eq!(compressibility_class(&text), CompressibilityClass::High);
632 }
633
634 #[test]
635 fn kolmogorov_diverse_higher_than_repetitive() {
636 let rep = "test\n".repeat(500);
637 let diverse = (0..500)
638 .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
639 .collect::<Vec<_>>()
640 .join("\n");
641 assert!(
642 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
643 "diverse content should have higher K"
644 );
645 }
646}