1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
/// BPE token entropy below this value marks a line as "low entropy"
/// (see `analyze_entropy` and the line-dropping pass in compression).
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
/// Number of hash functions in a MinHash signature (currently exercised
/// only by tests, hence the dead_code allowance).
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of an entropy-based compression pass.
#[derive(Debug)]
pub struct EntropyResult {
    /// The compressed text.
    pub output: String,
    /// Token count of the input before compression.
    pub original_tokens: usize,
    /// Token count of `output`.
    pub compressed_tokens: usize,
    /// Human-readable descriptions of the techniques that fired.
    pub techniques: Vec<String>,
}

impl EntropyResult {
    /// Percentage of the original tokens that compression removed.
    ///
    /// Returns 0.0 when there were no original tokens, and never goes
    /// negative even if the "compressed" text ended up larger.
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            total => {
                let kept = self.compressed_tokens.min(total);
                ((total - kept) as f64 / total as f64) * 100.0
            }
        }
    }
}
31
/// Shannon entropy of `text` in bits per character, over the character
/// frequency distribution. Empty input yields 0.0.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    let total = text.chars().count();
    if total == 0 {
        return 0.0;
    }

    let mut counts: HashMap<char, usize> = HashMap::new();
    for ch in text.chars() {
        *counts.entry(ch).or_default() += 1;
    }

    counts
        .values()
        .map(|&n| {
            let p = n as f64 / total as f64;
            -(p * p.log2())
        })
        .sum()
}
49
50pub fn token_entropy(text: &str) -> f64 {
53 let tokens = encode_tokens(text);
54 if tokens.is_empty() {
55 return 0.0;
56 }
57 let total = tokens.len();
58 let mut freq: HashMap<u32, usize> = HashMap::new();
59 for &t in &tokens {
60 *freq.entry(t).or_default() += 1;
61 }
62 freq.values().fold(0.0_f64, |acc, &count| {
63 let p = count as f64 / total as f64;
64 acc - p * p.log2()
65 })
66}
67
68pub fn normalized_token_entropy(text: &str) -> f64 {
72 let tokens = encode_tokens(text);
73 if tokens.is_empty() {
74 return 0.0;
75 }
76 let total = tokens.len();
77 let mut freq: HashMap<u32, usize> = HashMap::new();
78 for &t in &tokens {
79 *freq.entry(t).or_default() += 1;
80 }
81 let n_unique = freq.len();
82 if n_unique <= 1 {
83 return 0.0;
84 }
85 let h = freq.values().fold(0.0_f64, |acc, &count| {
86 let p = count as f64 / total as f64;
87 acc - p * p.log2()
88 });
89 let h_max = (n_unique as f64).log2();
90 h / h_max
91}
92
/// Jaccard similarity of the whitespace-separated word sets of `a` and `b`.
/// Returns 0.0 when both inputs have no words.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let shared = words_a.intersection(&words_b).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|
    let combined = words_a.len() + words_b.len() - shared;

    if combined == 0 {
        0.0
    } else {
        shared as f64 / combined as f64
    }
}
106
/// Jaccard similarity over word `n`-grams, which (unlike `jaccard_similarity`)
/// is sensitive to word order. Returns 0.0 when neither input has any n-grams.
pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
    let grams_a = ngram_set(a, n);
    let grams_b = ngram_set(b, n);

    let shared = grams_a.intersection(&grams_b).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|
    let combined = grams_a.len() + grams_b.len() - shared;

    if combined == 0 {
        return 0.0;
    }
    shared as f64 / combined as f64
}

/// Word `n`-grams of `text`. Texts shorter than `n` words contribute their
/// whole word list as a single (short) gram, so that non-empty short inputs
/// still compare as equal to themselves.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let mut grams = HashSet::new();
    if words.len() < n {
        if !words.is_empty() {
            grams.insert(words.iter().map(|w| (*w).to_string()).collect());
        }
    } else {
        for window in words.windows(n) {
            grams.insert(window.iter().map(|w| (*w).to_string()).collect());
        }
    }
    grams
}
135
136#[allow(dead_code)]
139pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
140 let ngrams = ngram_set(text, n);
141 if ngrams.is_empty() {
142 return vec![u64::MAX; k];
143 }
144 let mut signature = vec![u64::MAX; k];
145 for ngram in &ngrams {
146 for (i, min) in signature.iter_mut().enumerate() {
147 let h = hash_with_seed(ngram, i as u64);
148 if h < *min {
149 *min = h;
150 }
151 }
152 }
153 signature
154}
155
/// Estimated Jaccard similarity from two MinHash signatures: the fraction
/// of matching slots. Mismatched lengths or empty signatures yield 0.0.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut matches = 0usize;
    for (a, b) in sig_a.iter().zip(sig_b) {
        if a == b {
            matches += 1;
        }
    }
    matches as f64 / sig_a.len() as f64
}
169
/// Hash `value` with a seed folded into the stream, giving a family of
/// hash functions indexed by `seed` (used by MinHash).
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    // Tuple hashing feeds `seed` then `value` into the hasher, in order —
    // identical stream to hashing them one after the other.
    (seed, value).hash(&mut hasher);
    hasher.finish()
}
177
178pub fn kolmogorov_proxy(content: &str) -> f64 {
181 if content.is_empty() {
182 return 1.0;
183 }
184 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
185 encoder.write_all(content.as_bytes()).ok();
186 let compressed = encoder.finish().unwrap_or_default();
187 compressed.len() as f64 / content.len() as f64
188}
189
/// Bucketed compressibility of a text, derived from its gzip ratio
/// (see `compressibility_class`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    High,
    Medium,
    Low,
}

impl CompressibilityClass {
    /// Short human-readable label, including the K-ratio range the
    /// variant covers.
    pub fn label(&self) -> &'static str {
        match *self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
206
207pub fn compressibility_class(content: &str) -> CompressibilityClass {
209 let k = kolmogorov_proxy(content);
210 if k < 0.3 {
211 CompressibilityClass::High
212 } else if k < 0.6 {
213 CompressibilityClass::Medium
214 } else {
215 CompressibilityClass::Low
216 }
217}
218
219pub fn entropy_compress(content: &str) -> EntropyResult {
220 entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
221}
222
223pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
224 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
225 let before_lines = content.lines().count() as u32;
226 let result =
227 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
228 let after_lines = result.output.lines().count() as u32;
229
230 if before_lines != after_lines {
231 super::events::emit(super::events::EventKind::Compression {
232 path: path.to_string(),
233 before_lines,
234 after_lines,
235 strategy: "entropy_adaptive".to_string(),
236 kept_line_count: after_lines,
237 removed_line_count: before_lines.saturating_sub(after_lines),
238 });
239 }
240
241 result
242}
243
244fn entropy_compress_with_thresholds(
245 content: &str,
246 entropy_threshold: f64,
247 jaccard_threshold: f64,
248) -> EntropyResult {
249 let original_tokens = count_tokens(content);
250 let mut lines: Vec<&str> = content.lines().collect();
251 let mut techniques = Vec::new();
252
253 let original_count = lines.len();
254 lines.retain(|line| {
255 let trimmed = line.trim();
256 super::surprise::should_keep_line(trimmed, entropy_threshold)
257 });
258 let removed = original_count - lines.len();
259 if removed > 0 {
260 techniques.push(format!(
261 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
262 ));
263 }
264
265 let blocks = extract_blocks(&lines);
266 let groups = find_pattern_groups(&blocks, jaccard_threshold);
267 let mut dedup_count = 0;
268 for group in &groups {
269 if group.len() > 1 {
270 dedup_count += group.len() - 1;
271 }
272 }
273 if dedup_count > 0 {
274 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
275 }
276
277 let mut result: Vec<String> = Vec::new();
278 let mut skip_indices: HashSet<usize> = HashSet::new();
279 for group in &groups {
280 if group.len() > 1 {
281 for &idx in &group[1..] {
282 skip_indices.insert(idx);
283 }
284 }
285 }
286 for (i, line) in lines.iter().enumerate() {
287 if !skip_indices.contains(&i) {
288 result.push(line.to_string());
289 }
290 }
291
292 let mut collapsed = Vec::new();
293 let mut blank_count = 0;
294 for line in &result {
295 if line.trim().is_empty() {
296 blank_count += 1;
297 if blank_count <= 1 {
298 collapsed.push(line.clone());
299 }
300 } else {
301 blank_count = 0;
302 collapsed.push(line.clone());
303 }
304 }
305
306 let output = collapsed.join("\n");
307 let compressed_tokens = count_tokens(&output);
308
309 EntropyResult {
310 output,
311 original_tokens,
312 compressed_tokens,
313 techniques,
314 }
315}
316
/// Per-file entropy statistics produced by `analyze_entropy`.
#[derive(Debug)]
pub struct EntropyAnalysis {
    // Mean BPE token entropy over non-blank lines; 0.0 when there are none.
    pub avg_entropy: f64,
    // Non-blank lines with entropy below `BPE_ENTROPY_THRESHOLD`.
    pub low_entropy_count: usize,
    // Non-blank lines with entropy above 3.0.
    pub high_entropy_count: usize,
    // Total line count, blank lines included.
    pub total_lines: usize,
}
324
325pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
326 let lines: Vec<&str> = content.lines().collect();
327 let total = lines.len();
328 let mut sum = 0.0;
329 let mut low = 0;
330 let mut high = 0;
331 let mut counted = 0;
332
333 for line in &lines {
334 let trimmed = line.trim();
335 if trimmed.is_empty() {
336 continue;
337 }
338 let h = token_entropy(trimmed);
339 sum += h;
340 counted += 1;
341 if h < BPE_ENTROPY_THRESHOLD {
342 low += 1;
343 }
344 if h > 3.0 {
345 high += 1;
346 }
347 }
348
349 EntropyAnalysis {
350 avg_entropy: if counted > 0 {
351 sum / counted as f64
352 } else {
353 0.0
354 },
355 low_entropy_count: low,
356 high_entropy_count: high,
357 total_lines: total,
358 }
359}
360
/// A contiguous run of non-blank lines.
#[allow(dead_code)]
struct Block {
    // Index of the block's first line within the line slice it was built from.
    start: usize,
    // Trimmed lines of the block, each terminated by '\n'.
    content: String,
}

/// Split `lines` into blocks at blank-line boundaries. Each block records
/// where it started and its trimmed, newline-joined content.
fn extract_blocks(lines: &[&str]) -> Vec<Block> {
    let mut blocks = Vec::new();
    let mut buf = String::new();
    let mut first_line = 0;

    for (idx, raw) in lines.iter().enumerate() {
        let text = raw.trim();
        if text.is_empty() {
            // Blank line ends the current block, if any.
            if !buf.is_empty() {
                blocks.push(Block {
                    start: first_line,
                    content: std::mem::take(&mut buf),
                });
            }
        } else {
            if buf.is_empty() {
                first_line = idx;
            }
            buf.push_str(text);
            buf.push('\n');
        }
    }

    // Flush a trailing block that wasn't followed by a blank line.
    if !buf.is_empty() {
        blocks.push(Block {
            start: first_line,
            content: buf,
        });
    }

    blocks
}
398
399fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
400 let mut groups: Vec<Vec<usize>> = Vec::new();
401 let mut assigned: HashSet<usize> = HashSet::new();
402
403 for (i, block_a) in blocks.iter().enumerate() {
404 if assigned.contains(&i) {
405 continue;
406 }
407 let mut group = vec![i];
408 for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
409 if assigned.contains(&j) {
410 continue;
411 }
412 if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
413 group.push(j);
414 assigned.insert(j);
415 }
416 }
417 if group.len() > 1 {
418 assigned.insert(i);
419 }
420 groups.push(group);
421 }
422
423 groups
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    // --- character-level Shannon entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- word-set Jaccard similarity ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- end-to-end compression pipeline ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Guards against division by zero in savings_percent.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- BPE token entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- n-gram Jaccard (order-sensitive) ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- MinHash estimation ---

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        // No n-grams -> sentinel signature of all u64::MAX.
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- gzip-ratio Kolmogorov proxy ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}