1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
10const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
11
12#[derive(Debug)]
14pub struct EntropyResult {
15 pub output: String,
16 pub original_tokens: usize,
17 pub compressed_tokens: usize,
18 pub techniques: Vec<String>,
19}
20
21impl EntropyResult {
22 pub fn savings_percent(&self) -> f64 {
24 if self.original_tokens == 0 {
25 return 0.0;
26 }
27 let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
28 (saved as f64 / self.original_tokens as f64) * 100.0
29 }
30}
31
32pub fn shannon_entropy(text: &str) -> f64 {
34 if text.is_empty() {
35 return 0.0;
36 }
37 let mut freq: HashMap<char, usize> = HashMap::new();
38 let total = text.chars().count();
39
40 for c in text.chars() {
41 *freq.entry(c).or_default() += 1;
42 }
43
44 freq.values().fold(0.0_f64, |acc, &count| {
45 let p = count as f64 / total as f64;
46 acc - p * p.log2()
47 })
48}
49
50pub fn token_entropy(text: &str) -> f64 {
53 let tokens = encode_tokens(text);
54 if tokens.is_empty() {
55 return 0.0;
56 }
57 let total = tokens.len();
58 let mut freq: HashMap<u32, usize> = HashMap::new();
59 for &t in &tokens {
60 *freq.entry(t).or_default() += 1;
61 }
62 freq.values().fold(0.0_f64, |acc, &count| {
63 let p = count as f64 / total as f64;
64 acc - p * p.log2()
65 })
66}
67
68pub fn normalized_token_entropy(text: &str) -> f64 {
72 let tokens = encode_tokens(text);
73 if tokens.is_empty() {
74 return 0.0;
75 }
76 let total = tokens.len();
77 let mut freq: HashMap<u32, usize> = HashMap::new();
78 for &t in &tokens {
79 *freq.entry(t).or_default() += 1;
80 }
81 let n_unique = freq.len();
82 if n_unique <= 1 {
83 return 0.0;
84 }
85 let h = freq.values().fold(0.0_f64, |acc, &count| {
86 let p = count as f64 / total as f64;
87 acc - p * p.log2()
88 });
89 let h_max = (n_unique as f64).log2();
90 h / h_max
91}
92
93pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
95 let set_a: HashSet<&str> = a.split_whitespace().collect();
96 let set_b: HashSet<&str> = b.split_whitespace().collect();
97
98 let intersection = set_a.intersection(&set_b).count();
99 let union = set_a.union(&set_b).count();
100
101 if union == 0 {
102 return 0.0;
103 }
104 intersection as f64 / union as f64
105}
106
107pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
109 let set_a = ngram_set(a, n);
110 let set_b = ngram_set(b, n);
111
112 let intersection = set_a.intersection(&set_b).count();
113 let union = set_a.union(&set_b).count();
114
115 if union == 0 {
116 return 0.0;
117 }
118 intersection as f64 / union as f64
119}
120
121fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
122 let words: Vec<&str> = text.split_whitespace().collect();
123 if words.len() < n {
124 let mut set = HashSet::new();
125 if !words.is_empty() {
126 set.insert(words.iter().map(std::string::ToString::to_string).collect());
127 }
128 return set;
129 }
130 words
131 .windows(n)
132 .map(|w| w.iter().map(std::string::ToString::to_string).collect())
133 .collect()
134}
135
136pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
139 let ngrams = ngram_set(text, n);
140 if ngrams.is_empty() {
141 return vec![u64::MAX; k];
142 }
143 let mut signature = vec![u64::MAX; k];
144 for ngram in &ngrams {
145 for (i, min) in signature.iter_mut().enumerate() {
146 let h = hash_with_seed(ngram, i as u64);
147 if h < *min {
148 *min = h;
149 }
150 }
151 }
152 signature
153}
154
155pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
157 if sig_a.len() != sig_b.len() || sig_a.is_empty() {
158 return 0.0;
159 }
160 let matches = sig_a
161 .iter()
162 .zip(sig_b.iter())
163 .filter(|(a, b)| a == b)
164 .count();
165 matches as f64 / sig_a.len() as f64
166}
167
168fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
169 let mut hasher = std::collections::hash_map::DefaultHasher::new();
170 seed.hash(&mut hasher);
171 value.hash(&mut hasher);
172 hasher.finish()
173}
174
175pub fn kolmogorov_proxy(content: &str) -> f64 {
178 if content.is_empty() {
179 return 1.0;
180 }
181 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
182 encoder.write_all(content.as_bytes()).ok();
183 let compressed = encoder.finish().unwrap_or_default();
184 compressed.len() as f64 / content.len() as f64
185}
186
187#[derive(Debug, Clone, Copy, PartialEq, Eq)]
189pub enum CompressibilityClass {
190 High,
191 Medium,
192 Low,
193}
194
195impl CompressibilityClass {
196 pub fn label(&self) -> &'static str {
198 match self {
199 Self::High => "high (K<0.3)",
200 Self::Medium => "medium (0.3≤K<0.6)",
201 Self::Low => "low (K≥0.6)",
202 }
203 }
204}
205
206pub fn compressibility_class(content: &str) -> CompressibilityClass {
208 let k = kolmogorov_proxy(content);
209 if k < 0.3 {
210 CompressibilityClass::High
211 } else if k < 0.6 {
212 CompressibilityClass::Medium
213 } else {
214 CompressibilityClass::Low
215 }
216}
217
218pub fn entropy_compress(content: &str) -> EntropyResult {
220 entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
221}
222
223pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
225 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
226 let before_lines = content.lines().count() as u32;
227 let result =
228 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
229 let after_lines = result.output.lines().count() as u32;
230
231 if before_lines != after_lines {
232 super::events::emit(super::events::EventKind::Compression {
233 path: path.to_string(),
234 before_lines,
235 after_lines,
236 strategy: "entropy_adaptive".to_string(),
237 kept_line_count: after_lines,
238 removed_line_count: before_lines.saturating_sub(after_lines),
239 });
240 }
241
242 result
243}
244
245fn entropy_compress_with_thresholds(
246 content: &str,
247 entropy_threshold: f64,
248 jaccard_threshold: f64,
249) -> EntropyResult {
250 let original_tokens = count_tokens(content);
251 let mut lines: Vec<&str> = content.lines().collect();
252 let mut techniques = Vec::new();
253
254 let original_count = lines.len();
255 lines.retain(|line| {
256 let trimmed = line.trim();
257 super::surprise::should_keep_line(trimmed, entropy_threshold)
258 });
259 let removed = original_count - lines.len();
260 if removed > 0 {
261 techniques.push(format!(
262 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
263 ));
264 }
265
266 let blocks = extract_blocks(&lines);
267 let groups = find_pattern_groups(&blocks, jaccard_threshold);
268 let mut dedup_count = 0;
269 for group in &groups {
270 if group.len() > 1 {
271 dedup_count += group.len() - 1;
272 }
273 }
274 if dedup_count > 0 {
275 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
276 }
277
278 let mut result: Vec<String> = Vec::new();
279 let mut skip_indices: HashSet<usize> = HashSet::new();
280 for group in &groups {
281 if group.len() > 1 {
282 for &idx in &group[1..] {
283 skip_indices.insert(idx);
284 }
285 }
286 }
287 for (i, line) in lines.iter().enumerate() {
288 if !skip_indices.contains(&i) {
289 result.push(line.to_string());
290 }
291 }
292
293 let mut collapsed = Vec::new();
294 let mut blank_count = 0;
295 for line in &result {
296 if line.trim().is_empty() {
297 blank_count += 1;
298 if blank_count <= 1 {
299 collapsed.push(line.clone());
300 }
301 } else {
302 blank_count = 0;
303 collapsed.push(line.clone());
304 }
305 }
306
307 let output = collapsed.join("\n");
308 let compressed_tokens = count_tokens(&output);
309
310 EntropyResult {
311 output,
312 original_tokens,
313 compressed_tokens,
314 techniques,
315 }
316}
317
318#[derive(Debug)]
320pub struct EntropyAnalysis {
321 pub avg_entropy: f64,
322 pub low_entropy_count: usize,
323 pub high_entropy_count: usize,
324 pub total_lines: usize,
325}
326
327pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
329 let lines: Vec<&str> = content.lines().collect();
330 let total = lines.len();
331 let mut sum = 0.0;
332 let mut low = 0;
333 let mut high = 0;
334 let mut counted = 0;
335
336 for line in &lines {
337 let trimmed = line.trim();
338 if trimmed.is_empty() {
339 continue;
340 }
341 let h = token_entropy(trimmed);
342 sum += h;
343 counted += 1;
344 if h < BPE_ENTROPY_THRESHOLD {
345 low += 1;
346 }
347 if h > 3.0 {
348 high += 1;
349 }
350 }
351
352 EntropyAnalysis {
353 avg_entropy: if counted > 0 {
354 sum / counted as f64
355 } else {
356 0.0
357 },
358 low_entropy_count: low,
359 high_entropy_count: high,
360 total_lines: total,
361 }
362}
363
364struct Block {
365 content: String,
366}
367
368fn extract_blocks(lines: &[&str]) -> Vec<Block> {
369 let mut blocks = Vec::new();
370 let mut current = String::new();
371
372 for line in lines {
373 let trimmed = line.trim();
374 if trimmed.is_empty() && !current.is_empty() {
375 blocks.push(Block {
376 content: current.clone(),
377 });
378 current.clear();
379 } else if !trimmed.is_empty() {
380 current.push_str(trimmed);
381 current.push('\n');
382 }
383 }
384
385 if !current.is_empty() {
386 blocks.push(Block { content: current });
387 }
388
389 blocks
390}
391
392fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
393 let sets: Vec<HashSet<Vec<String>>> = blocks.iter().map(|b| ngram_set(&b.content, 2)).collect();
397 let sizes: Vec<usize> = sets.iter().map(std::collections::HashSet::len).collect();
398
399 let mut groups: Vec<Vec<usize>> = Vec::new();
400 let mut assigned: HashSet<usize> = HashSet::new();
401
402 for i in 0..blocks.len() {
403 if assigned.contains(&i) {
404 continue;
405 }
406 let mut group = vec![i];
407 for j in (i + 1)..blocks.len() {
408 if assigned.contains(&j) {
409 continue;
410 }
411 let size_i = sizes[i];
412 let size_j = sizes[j];
413 let min_sz = size_i.min(size_j);
414 let max_sz = size_i.max(size_j);
415 if max_sz > 0 && (min_sz as f64) < (threshold * max_sz as f64) {
416 continue;
417 }
418 let inter = sets[i].intersection(&sets[j]).count();
419 let union = size_i + size_j - inter;
420 if union > 0 && (inter as f64 / union as f64) >= threshold {
421 group.push(j);
422 assigned.insert(j);
423 }
424 }
425 if group.len() > 1 {
426 assigned.insert(i);
427 }
428 groups.push(group);
429 }
430
431 groups
432}
433
434#[cfg(test)]
435mod tests {
436 use super::*;
437
438 #[test]
439 fn shannon_entropy_empty_is_zero() {
440 assert_eq!(shannon_entropy(""), 0.0);
441 }
442
443 #[test]
444 fn shannon_entropy_single_char() {
445 assert_eq!(shannon_entropy("aaaa"), 0.0);
446 }
447
448 #[test]
449 fn shannon_entropy_high_for_varied_text() {
450 let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
451 let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
452 assert!(
453 shannon_entropy(varied) > shannon_entropy(uniform),
454 "varied text should have higher entropy"
455 );
456 }
457
458 #[test]
459 fn jaccard_identical_is_one() {
460 let sim = jaccard_similarity("hello world", "hello world");
461 assert!((sim - 1.0).abs() < f64::EPSILON);
462 }
463
464 #[test]
465 fn jaccard_disjoint_is_zero() {
466 let sim = jaccard_similarity("abc", "xyz");
467 assert_eq!(sim, 0.0);
468 }
469
470 #[test]
471 fn jaccard_partial_overlap() {
472 let sim = jaccard_similarity("hello world", "hello rust");
473 assert!(sim > 0.0 && sim < 1.0);
474 }
475
476 #[test]
477 fn entropy_compress_produces_output() {
478 let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
479 let result = entropy_compress(content);
480 assert!(!result.output.is_empty(), "should produce non-empty output");
481 assert!(result.compressed_tokens <= result.original_tokens);
482 }
483
484 #[test]
485 fn entropy_result_savings() {
486 let r = EntropyResult {
487 output: "short".to_string(),
488 original_tokens: 100,
489 compressed_tokens: 60,
490 techniques: vec!["test".to_string()],
491 };
492 assert!((r.savings_percent() - 40.0).abs() < 0.1);
493 }
494
495 #[test]
496 fn entropy_result_zero_original() {
497 let r = EntropyResult {
498 output: String::new(),
499 original_tokens: 0,
500 compressed_tokens: 0,
501 techniques: vec![],
502 };
503 assert_eq!(r.savings_percent(), 0.0);
504 }
505
506 #[test]
507 fn token_entropy_empty_is_zero() {
508 assert_eq!(token_entropy(""), 0.0);
509 }
510
511 #[test]
512 fn token_entropy_single_repeated_token() {
513 assert_eq!(token_entropy("}"), 0.0);
514 }
515
516 #[test]
517 fn token_entropy_higher_for_diverse_code() {
518 let diverse = "let result = compute_something(x, y, z);";
519 let repetitive = "aaaa aaaa aaaa aaaa";
520 assert!(
521 token_entropy(diverse) > token_entropy(repetitive),
522 "diverse code should have higher BPE token entropy"
523 );
524 }
525
526 #[test]
527 fn token_entropy_vs_char_entropy_differ() {
528 let code = "fn main() { println!(\"hello world\"); }";
529 let te = token_entropy(code);
530 let ce = shannon_entropy(code);
531 assert!(te != ce, "BPE and char entropy should differ for code");
532 }
533
534 #[test]
535 fn ngram_jaccard_preserves_order() {
536 let a = "a b c d";
537 let b = "d c b a";
538 let word_j = jaccard_similarity(a, b);
539 let ngram_j = ngram_jaccard(a, b, 2);
540 assert!(
541 ngram_j < word_j,
542 "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
543 );
544 }
545
546 #[test]
547 fn ngram_jaccard_identical_is_one() {
548 let text = "fn main() { println!(\"hello\"); }";
549 let j = ngram_jaccard(text, text, 2);
550 assert!((j - 1.0).abs() < f64::EPSILON);
551 }
552
553 #[test]
554 fn ngram_jaccard_disjoint_is_zero() {
555 let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
556 assert_eq!(j, 0.0);
557 }
558
559 #[test]
560 fn minhash_approximates_jaccard() {
561 let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
562 let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
563 let exact = ngram_jaccard(a, b, 2);
564 let sig_a = minhash_signature(a, 2, 128);
565 let sig_b = minhash_signature(b, 2, 128);
566 let approx = minhash_similarity(&sig_a, &sig_b);
567 assert!(
568 (exact - approx).abs() < 0.2,
569 "minhash ({approx}) should approximate exact ({exact}) within 0.2"
570 );
571 }
572
573 #[test]
574 fn minhash_empty_text() {
575 let sig = minhash_signature("", 2, 64);
576 assert!(sig.iter().all(|&v| v == u64::MAX));
577 }
578
579 #[test]
580 fn kolmogorov_empty_is_one() {
581 assert_eq!(kolmogorov_proxy(""), 1.0);
582 }
583
584 #[test]
585 fn kolmogorov_repetitive_is_low() {
586 let repetitive = "aaa\n".repeat(1000);
587 let k = kolmogorov_proxy(&repetitive);
588 assert!(
589 k < 0.1,
590 "highly repetitive text should compress well: K={k}"
591 );
592 }
593
594 #[test]
595 fn kolmogorov_diverse_is_higher() {
596 let repetitive = "aaa\n".repeat(500);
597 let diverse = (0..500)
598 .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
599 .collect::<Vec<_>>()
600 .join("\n");
601 assert!(
602 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
603 "diverse content should have higher K than repetitive"
604 );
605 }
606
607 #[test]
608 fn compressibility_class_repetitive_is_high() {
609 let text = "use std::io;\n".repeat(200);
610 assert_eq!(compressibility_class(&text), CompressibilityClass::High);
611 }
612
613 #[test]
614 fn kolmogorov_diverse_higher_than_repetitive() {
615 let rep = "test\n".repeat(500);
616 let diverse = (0..500)
617 .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
618 .collect::<Vec<_>>()
619 .join("\n");
620 assert!(
621 kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
622 "diverse content should have higher K"
623 );
624 }
625}