1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12#[derive(Debug)]
14pub struct EntropyResult {
15 pub output: String,
16 pub original_tokens: usize,
17 pub compressed_tokens: usize,
18 pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22 pub fn savings_percent(&self) -> f64 {
24 if self.original_tokens == 0 {
25 return 0.0;
26 }
27 let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28 (saved as f64 / self.original_tokens as f64) * 100.0
29 }
30}
31
32pub fn shannon_entropy(text: &str) -> f64 {
34 if text.is_empty() {
35 return 0.0;
36 }
37 let mut freq: HashMap<char, usize> = HashMap::new();
38 let total = text.chars().count();
39
40 for c in text.chars() {
41 *freq.entry(c).or_default() += 1;
42 }
43
44 freq.values().fold(0.0_f64, |acc, &count| {
45 let p = count as f64 / total as f64;
46 acc - p * p.log2()
47 })
48}
49
50pub fn token_entropy_from_ids(tokens: &[u32]) -> f64 {
52 if tokens.is_empty() {
53 return 0.0;
54 }
55 let total = tokens.len();
56 let mut freq: HashMap<u32, usize> = HashMap::new();
57 for &t in tokens {
58 *freq.entry(t).or_default() += 1;
59 }
60 freq.values().fold(0.0_f64, |acc, &count| {
61 let p = count as f64 / total as f64;
62 acc - p * p.log2()
63 })
64}
65
66pub fn token_entropy(text: &str) -> f64 {
69 let tokens = encode_tokens(text);
70 token_entropy_from_ids(&tokens)
71}
72
73pub fn normalized_token_entropy_from_ids(tokens: &[u32]) -> f64 {
75 if tokens.is_empty() {
76 return 0.0;
77 }
78 let total = tokens.len();
79 let mut freq: HashMap<u32, usize> = HashMap::new();
80 for &t in tokens {
81 *freq.entry(t).or_default() += 1;
82 }
83 let n_unique = freq.len();
84 if n_unique <= 1 {
85 return 0.0;
86 }
87 let h = freq.values().fold(0.0_f64, |acc, &count| {
88 let p = count as f64 / total as f64;
89 acc - p * p.log2()
90 });
91 let h_max = (n_unique as f64).log2();
92 h / h_max
93}
94
95pub fn normalized_token_entropy(text: &str) -> f64 {
99 let tokens = encode_tokens(text);
100 normalized_token_entropy_from_ids(&tokens)
101}
102
103pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
105 let set_a: HashSet<&str> = a.split_whitespace().collect();
106 let set_b: HashSet<&str> = b.split_whitespace().collect();
107
108 let intersection = set_a.intersection(&set_b).count();
109 let union = set_a.union(&set_b).count();
110
111 if union == 0 {
112 return 0.0;
113 }
114 intersection as f64 / union as f64
115}
116
117pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
119 let set_a = ngram_set(a, n);
120 let set_b = ngram_set(b, n);
121
122 let intersection = set_a.intersection(&set_b).count();
123 let union = set_a.union(&set_b).count();
124
125 if union == 0 {
126 return 0.0;
127 }
128 intersection as f64 / union as f64
129}
130
131fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
132 let words: Vec<&str> = text.split_whitespace().collect();
133 if words.len() < n {
134 let mut set = HashSet::new();
135 if !words.is_empty() {
136 set.insert(words.iter().map(std::string::ToString::to_string).collect());
137 }
138 return set;
139 }
140 words
141 .windows(n)
142 .map(|w| w.iter().map(std::string::ToString::to_string).collect())
143 .collect()
144}
145
146pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
149 let ngrams = ngram_set(text, n);
150 if ngrams.is_empty() {
151 return vec![u64::MAX; k];
152 }
153 let mut signature = vec![u64::MAX; k];
154 for ngram in &ngrams {
155 for (i, min) in signature.iter_mut().enumerate() {
156 let h = hash_with_seed(ngram, i as u64);
157 if h < *min {
158 *min = h;
159 }
160 }
161 }
162 signature
163}
164
165pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
167 if sig_a.len() != sig_b.len() || sig_a.is_empty() {
168 return 0.0;
169 }
170 let matches = sig_a
171 .iter()
172 .zip(sig_b.iter())
173 .filter(|(a, b)| a == b)
174 .count();
175 matches as f64 / sig_a.len() as f64
176}
177
178fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
179 let mut hasher = std::collections::hash_map::DefaultHasher::new();
180 seed.hash(&mut hasher);
181 value.hash(&mut hasher);
182 hasher.finish()
183}
184
185pub fn kolmogorov_proxy(content: &str) -> f64 {
188 if content.is_empty() {
189 return 1.0;
190 }
191 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
192 encoder.write_all(content.as_bytes()).ok();
193 let compressed = encoder.finish().unwrap_or_default();
194 compressed.len() as f64 / content.len() as f64
195}
196
197#[derive(Debug, Clone, Copy, PartialEq, Eq)]
199pub enum CompressibilityClass {
200 High,
201 Medium,
202 Low,
203}
204
205impl CompressibilityClass {
206 pub fn label(&self) -> &'static str {
208 match self {
209 Self::High => "high (K<0.3)",
210 Self::Medium => "medium (0.3≤K<0.6)",
211 Self::Low => "low (K≥0.6)",
212 }
213 }
214}
215
216pub fn compressibility_class(content: &str) -> CompressibilityClass {
218 let k = kolmogorov_proxy(content);
219 if k < 0.3 {
220 CompressibilityClass::High
221 } else if k < 0.6 {
222 CompressibilityClass::Medium
223 } else {
224 CompressibilityClass::Low
225 }
226}
227
228pub fn entropy_compress(content: &str) -> EntropyResult {
230 entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
231}
232
233pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
235 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
236 let before_lines = content.lines().count() as u32;
237 let result =
238 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
239 let after_lines = result.output.lines().count() as u32;
240
241 if before_lines != after_lines {
242 super::events::emit(super::events::EventKind::Compression {
243 path: path.to_string(),
244 before_lines,
245 after_lines,
246 strategy: "entropy_adaptive".to_string(),
247 kept_line_count: after_lines,
248 removed_line_count: before_lines.saturating_sub(after_lines),
249 });
250 }
251
252 result
253}
254
255fn entropy_compress_with_thresholds(
256 content: &str,
257 entropy_threshold: f64,
258 jaccard_threshold: f64,
259) -> EntropyResult {
260 let original_tokens = count_tokens(content);
261 let mut lines: Vec<&str> = content.lines().collect();
262 let mut techniques = Vec::new();
263
264 let original_count = lines.len();
265 lines.retain(|line| {
266 let trimmed = line.trim();
267 super::surprise::should_keep_line(trimmed, entropy_threshold)
268 });
269 let removed = original_count - lines.len();
270 if removed > 0 {
271 techniques.push(format!(
272 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
273 ));
274 }
275
276 let blocks = extract_blocks(&lines);
277 let groups = find_pattern_groups(&blocks, jaccard_threshold);
278 let mut dedup_count = 0;
279 for group in &groups {
280 if group.len() > 1 {
281 dedup_count += group.len() - 1;
282 }
283 }
284 if dedup_count > 0 {
285 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
286 }
287
288 let mut result: Vec<String> = Vec::new();
289 let mut skip_indices: HashSet<usize> = HashSet::new();
290 for group in &groups {
291 if group.len() > 1 {
292 for &idx in &group[1..] {
293 skip_indices.insert(idx);
294 }
295 }
296 }
297 for (i, line) in lines.iter().enumerate() {
298 if !skip_indices.contains(&i) {
299 result.push(line.to_string());
300 }
301 }
302
303 let mut collapsed = Vec::new();
304 let mut blank_count = 0;
305 for line in &result {
306 if line.trim().is_empty() {
307 blank_count += 1;
308 if blank_count <= 1 {
309 collapsed.push(line.clone());
310 }
311 } else {
312 blank_count = 0;
313 collapsed.push(line.clone());
314 }
315 }
316
317 let output = collapsed.join("\n");
318 let compressed_tokens = count_tokens(&output);
319
320 EntropyResult {
321 output,
322 original_tokens,
323 compressed_tokens,
324 techniques,
325 }
326}
327
328#[derive(Debug)]
330pub struct EntropyAnalysis {
331 pub avg_entropy: f64,
332 pub low_entropy_count: usize,
333 pub high_entropy_count: usize,
334 pub total_lines: usize,
335}
336
337pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
339 let lines: Vec<&str> = content.lines().collect();
340 let total = lines.len();
341 let mut sum = 0.0;
342 let mut low = 0;
343 let mut high = 0;
344 let mut counted = 0;
345
346 for line in &lines {
347 let trimmed = line.trim();
348 if trimmed.is_empty() {
349 continue;
350 }
351 let h = token_entropy(trimmed);
352 sum += h;
353 counted += 1;
354 if h < BPE_ENTROPY_THRESHOLD {
355 low += 1;
356 }
357 if h > 3.0 {
358 high += 1;
359 }
360 }
361
362 EntropyAnalysis {
363 avg_entropy: if counted > 0 {
364 sum / counted as f64
365 } else {
366 0.0
367 },
368 low_entropy_count: low,
369 high_entropy_count: high,
370 total_lines: total,
371 }
372}
373
374struct Block {
375 content: String,
376}
377
378fn extract_blocks(lines: &[&str]) -> Vec<Block> {
379 let mut blocks = Vec::new();
380 let mut current = String::new();
381
382 for line in lines {
383 let trimmed = line.trim();
384 if trimmed.is_empty() && !current.is_empty() {
385 blocks.push(Block {
386 content: current.clone(),
387 });
388 current.clear();
389 } else if !trimmed.is_empty() {
390 current.push_str(trimmed);
391 current.push('\n');
392 }
393 }
394
395 if !current.is_empty() {
396 blocks.push(Block { content: current });
397 }
398
399 blocks
400}
401
402fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
403 let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
407 let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
408
409 let mut groups: Vec<Vec<usize>> = Vec::new();
410 let mut assigned: HashSet<usize> = HashSet::new();
411
412 for i in 0..blocks.len() {
413 if assigned.contains(&i) {
414 continue;
415 }
416 let mut group = vec![i];
417 for j in (i + 1)..blocks.len() {
418 if assigned.contains(&j) {
419 continue;
420 }
421 let size_i = sizes[i];
422 let size_j = sizes[j];
423 let min_sz = size_i.min(size_j);
424 let max_sz = size_i.max(size_j);
425 if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
426 continue;
427 }
428 let inter = sets[i].intersection(&sets[j]).count();
429 let union = size_i + size_j - inter;
430 if union > 0 && (inter as f64 / union as f64) >= threshold {
431 group.push(j);
432 assigned.insert(j);
433 }
434 }
435 if group.len() > 1 {
436 assigned.insert(i);
437 }
438 groups.push(group);
439 }
440
441 groups
442}
443
444#[cfg(test)]
445mod tests {
446 use super::*;
447
448 #[test]
449 fn shannon_entropy_empty_is_zero() {
450 assert_eq!(shannon_entropy(""), 0.0);
451 }
452
453 #[test]
454 fn shannon_entropy_single_char() {
455 assert_eq!(shannon_entropy("aaaa"), 0.0);
456 }
457
458 #[test]
459 fn shannon_entropy_high_for_varied_text() {
460 let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
461 let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
462 assert!(
463 shannon_entropy(varied) > shannon_entropy(uniform),
464 "varied text should have higher entropy"
465 );
466 }
467
468 #[test]
469 fn jaccard_identical_is_one() {
470 let sim = jaccard_similarity("hello world", "hello world");
471 assert!((sim - 1.0).abs() < f64::EPSILON);
472 }
473
474 #[test]
475 fn jaccard_disjoint_is_zero() {
476 let sim = jaccard_similarity("abc", "xyz");
477 assert_eq!(sim, 0.0);
478 }
479
480 #[test]
481 fn jaccard_partial_overlap() {
482 let sim = jaccard_similarity("hello world", "hello rust");
483 assert!(sim > 0.0 && sim < 1.0);
484 }
485
486 #[test]
487 fn entropy_compress_produces_output() {
488 let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
489 let result = entropy_compress(content);
490 assert!(!result.output.is_empty(), "should produce non-empty output");
491 assert!(result.compressed_tokens <= result.original_tokens);
492 }
493
494 #[test]
495 fn entropy_result_savings() {
496 let r = EntropyResult {
497 output: "short".to_string(),
498 original_tokens: 100,
499 compressed_tokens: 60,
500 techniques: vec!["test".to_string()],
501 };
502 assert!((r.savings_percent() - 40.0).abs() < 0.1);
503 }
504
505 #[test]
506 fn entropy_result_zero_original() {
507 let r = EntropyResult {
508 output: String::new(),
509 original_tokens: 0,
510 compressed_tokens: 0,
511 techniques: vec![],
512 };
513 assert_eq!(r.savings_percent(), 0.0);
514 }
515
516 #[test]
517 fn token_entropy_empty_is_zero() {
518 assert_eq!(token_entropy(""), 0.0);
519 }
520
521 #[test]
522 fn token_entropy_single_repeated_token() {
523 assert_eq!(token_entropy("}"), 0.0);
524 }
525
526 #[test]
527 fn token_entropy_higher_for_diverse_code() {
528 let diverse = "let result = compute_something(x, y, z);";
529 let repetitive = "aaaa aaaa aaaa aaaa";
530 assert!(
531 token_entropy(diverse) > token_entropy(repetitive),
532 "diverse code should have higher BPE token entropy"
533 );
534 }
535
536 #[test]
537 fn token_entropy_vs_char_entropy_differ() {
538 let code = "fn main() { println!(\"hello world\"); }";
539 let te = token_entropy(code);
540 let ce = shannon_entropy(code);
541 assert!(te != ce, "BPE and char entropy should differ for code");
542 }
543
544 #[test]
545 fn ngram_jaccard_preserves_order() {
546 let a = "a b c d";
547 let b = "d c b a";
548 let word_j = jaccard_similarity(a, b);
549 let ngram_j = ngram_jaccard(a, b, 2);
550 assert!(
551 ngram_j < word_j,
552 "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
553 );
554 }
555
556 #[test]
557 fn ngram_jaccard_identical_is_one() {
558 let text = "fn main() { println!(\"hello\"); }";
559 let j = ngram_jaccard(text, text, 2);
560 assert!((j - 1.0).abs() < f64::EPSILON);
561 }
562
563 #[test]
564 fn ngram_jaccard_disjoint_is_zero() {
565 let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
566 assert_eq!(j, 0.0);
567 }
568
569 #[test]
570 fn minhash_approximates_jaccard() {
571 let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
572 let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
573 let exact = ngram_jaccard(a, b, 2);
574 let sig_a = minhash_signature(a, 2, 128);
575 let sig_b = minhash_signature(b, 2, 128);
576 let approx = minhash_similarity(&sig_a, &sig_b);
577 assert!(
578 (exact - approx).abs() < 0.2,
579 "minhash ({approx}) should approximate exact ({exact}) within 0.2"
580 );
581 }
582
583 #[test]
584 fn minhash_empty_text() {
585 let sig = minhash_signature("", 2, 64);
586 assert!(sig.iter().all(|&v| v == u64::MAX));
587 }
588
589 #[test]
590 fn kolmogorov_empty_is_one() {
591 assert_eq!(kolmogorov_proxy(""), 1.0);
592 }
593
594 #[test]
595 fn kolmogorov_repetitive_is_low() {
596 let repetitive = "aaa\n".repeat(1000);
597 let k = kolmogorov_proxy(&repetitive);
598 assert!(
599 k < 0.1,
600 "highly repetitive text should compress well: K={k}"
601 );
602 }
603
604 #[test]
605 fn kolmogorov_diverse_is_higher() {
606 let repetitive = "aaa\n".repeat(500);
607 let diverse = (0..500)
608 .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
609 .collect::<Vec<_>>()
610 .join("\n");
611 assert!(
612 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
613 "diverse content should have higher K than repetitive"
614 );
615 }
616
617 #[test]
618 fn compressibility_class_repetitive_is_high() {
619 let text = "use std::io;\n".repeat(200);
620 assert_eq!(compressibility_class(&text), CompressibilityClass::High);
621 }
622
623 #[test]
624 fn kolmogorov_diverse_higher_than_repetitive() {
625 let rep = "test\n".repeat(500);
626 let diverse = (0..500)
627 .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
628 .collect::<Vec<_>>()
629 .join("\n");
630 assert!(
631 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
632 "diverse content should have higher K"
633 );
634 }
635}