1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
// Lines whose BPE token entropy falls below this value are treated as
// low-information and become candidates for removal during compression.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
// Default number of hash functions in a MinHash signature (longer signatures
// approximate the true Jaccard similarity more closely).
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of one entropy-based compression pass: the rewritten text plus
/// before/after token counts and a log of the techniques that fired.
#[derive(Debug)]
#[allow(dead_code)]
pub struct EntropyResult {
    /// The compressed text.
    pub output: String,
    /// Token count of the input before compression.
    pub original_tokens: usize,
    /// Token count of `output` after compression.
    pub compressed_tokens: usize,
    /// Human-readable descriptions of the techniques that removed content.
    pub techniques: Vec<String>,
}

#[allow(dead_code)]
impl EntropyResult {
    /// Percentage of tokens removed, in `[0.0, 100.0]`.
    ///
    /// Returns `0.0` when the original was empty (avoids division by zero)
    /// and also when the "compressed" output grew (saturating subtraction).
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            original => {
                let saved = original.saturating_sub(self.compressed_tokens) as f64;
                saved / original as f64 * 100.0
            }
        }
    }
}
33
/// Shannon entropy (bits per character) of `text`, computed over the
/// character frequency distribution. Empty input yields `0.0`.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    let total = text.chars().count();
    if total == 0 {
        return 0.0;
    }

    let mut counts: HashMap<char, usize> = HashMap::new();
    for ch in text.chars() {
        *counts.entry(ch).or_default() += 1;
    }

    let n = total as f64;
    counts
        .values()
        .map(|&count| {
            let p = count as f64 / n;
            -p * p.log2()
        })
        .sum()
}
51
52pub fn token_entropy(text: &str) -> f64 {
55 let tokens = encode_tokens(text);
56 if tokens.is_empty() {
57 return 0.0;
58 }
59 let total = tokens.len();
60 let mut freq: HashMap<u32, usize> = HashMap::new();
61 for &t in &tokens {
62 *freq.entry(t).or_default() += 1;
63 }
64 freq.values().fold(0.0_f64, |acc, &count| {
65 let p = count as f64 / total as f64;
66 acc - p * p.log2()
67 })
68}
69
70pub fn normalized_token_entropy(text: &str) -> f64 {
74 let tokens = encode_tokens(text);
75 if tokens.is_empty() {
76 return 0.0;
77 }
78 let total = tokens.len();
79 let mut freq: HashMap<u32, usize> = HashMap::new();
80 for &t in &tokens {
81 *freq.entry(t).or_default() += 1;
82 }
83 let n_unique = freq.len();
84 if n_unique <= 1 {
85 return 0.0;
86 }
87 let h = freq.values().fold(0.0_f64, |acc, &count| {
88 let p = count as f64 / total as f64;
89 acc - p * p.log2()
90 });
91 let h_max = (n_unique as f64).log2();
92 h / h_max
93}
94
/// Jaccard similarity of the whitespace-separated word sets of `a` and `b`:
/// |A ∩ B| / |A ∪ B|. Two empty texts yield `0.0`.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let union = words_a.union(&words_b).count();
    if union == 0 {
        return 0.0;
    }

    let shared = words_a.intersection(&words_b).count();
    shared as f64 / union as f64
}
108
109pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
111 let set_a = ngram_set(a, n);
112 let set_b = ngram_set(b, n);
113
114 let intersection = set_a.intersection(&set_b).count();
115 let union = set_a.union(&set_b).count();
116
117 if union == 0 {
118 return 0.0;
119 }
120 intersection as f64 / union as f64
121}
122
/// Builds the set of word n-grams of `text` (words split on whitespace).
///
/// Texts shorter than `n` words contribute their whole word sequence as a
/// single "gram" (empty text contributes nothing), so short inputs still
/// compare meaningfully.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let mut grams = HashSet::new();

    if words.len() < n {
        if !words.is_empty() {
            grams.insert(words.iter().map(|w| w.to_string()).collect());
        }
        return grams;
    }

    for window in words.windows(n) {
        grams.insert(window.iter().map(|s| s.to_string()).collect());
    }
    grams
}
137
138#[allow(dead_code)]
141pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
142 let ngrams = ngram_set(text, n);
143 if ngrams.is_empty() {
144 return vec![u64::MAX; k];
145 }
146 let mut signature = vec![u64::MAX; k];
147 for ngram in &ngrams {
148 for (i, min) in signature.iter_mut().enumerate() {
149 let h = hash_with_seed(ngram, i as u64);
150 if h < *min {
151 *min = h;
152 }
153 }
154 }
155 signature
156}
157
/// Fraction of matching slots between two MinHash signatures — an unbiased
/// estimate of the underlying Jaccard similarity. Signatures of different
/// lengths (or empty ones) yield `0.0`.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut matching = 0usize;
    for (x, y) in sig_a.iter().zip(sig_b) {
        if x == y {
            matching += 1;
        }
    }
    matching as f64 / sig_a.len() as f64
}
171
/// Hashes `value` with `seed` folded in first, so the same value produces a
/// different hash per seed — this is how one `DefaultHasher` simulates the
/// family of hash functions MinHash needs.
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    use std::collections::hash_map::DefaultHasher;

    let mut state = DefaultHasher::new();
    seed.hash(&mut state);
    value.hash(&mut state);
    state.finish()
}
179
180pub fn kolmogorov_proxy(content: &str) -> f64 {
183 if content.is_empty() {
184 return 1.0;
185 }
186 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
187 encoder.write_all(content.as_bytes()).ok();
188 let compressed = encoder.finish().unwrap_or_default();
189 compressed.len() as f64 / content.len() as f64
190}
191
/// Coarse classification of how compressible a piece of text is, derived
/// from the gzip compression ratio (the "K" proxy).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    High,
    Medium,
    Low,
}

impl CompressibilityClass {
    /// Short human-readable label including the K-ratio range for the class.
    pub fn label(&self) -> &'static str {
        match self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
208
209pub fn compressibility_class(content: &str) -> CompressibilityClass {
211 let k = kolmogorov_proxy(content);
212 if k < 0.3 {
213 CompressibilityClass::High
214 } else if k < 0.6 {
215 CompressibilityClass::Medium
216 } else {
217 CompressibilityClass::Low
218 }
219}
220
221pub fn entropy_compress(content: &str) -> EntropyResult {
222 entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
223}
224
225pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
226 let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
227 let before_lines = content.lines().count() as u32;
228 let result =
229 entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard);
230 let after_lines = result.output.lines().count() as u32;
231
232 if before_lines != after_lines {
233 super::events::emit(super::events::EventKind::Compression {
234 path: path.to_string(),
235 before_lines,
236 after_lines,
237 strategy: "entropy_adaptive".to_string(),
238 kept_line_count: after_lines,
239 removed_line_count: before_lines.saturating_sub(after_lines),
240 });
241 }
242
243 result
244}
245
246fn entropy_compress_with_thresholds(
247 content: &str,
248 entropy_threshold: f64,
249 jaccard_threshold: f64,
250) -> EntropyResult {
251 let original_tokens = count_tokens(content);
252 let mut lines: Vec<&str> = content.lines().collect();
253 let mut techniques = Vec::new();
254
255 let original_count = lines.len();
256 lines.retain(|line| {
257 let trimmed = line.trim();
258 super::surprise::should_keep_line(trimmed, entropy_threshold)
259 });
260 let removed = original_count - lines.len();
261 if removed > 0 {
262 techniques.push(format!(
263 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
264 ));
265 }
266
267 let blocks = extract_blocks(&lines);
268 let groups = find_pattern_groups(&blocks, jaccard_threshold);
269 let mut dedup_count = 0;
270 for group in &groups {
271 if group.len() > 1 {
272 dedup_count += group.len() - 1;
273 }
274 }
275 if dedup_count > 0 {
276 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
277 }
278
279 let mut result: Vec<String> = Vec::new();
280 let mut skip_indices: HashSet<usize> = HashSet::new();
281 for group in &groups {
282 if group.len() > 1 {
283 for &idx in &group[1..] {
284 skip_indices.insert(idx);
285 }
286 }
287 }
288 for (i, line) in lines.iter().enumerate() {
289 if !skip_indices.contains(&i) {
290 result.push(line.to_string());
291 }
292 }
293
294 let mut collapsed = Vec::new();
295 let mut blank_count = 0;
296 for line in &result {
297 if line.trim().is_empty() {
298 blank_count += 1;
299 if blank_count <= 1 {
300 collapsed.push(line.clone());
301 }
302 } else {
303 blank_count = 0;
304 collapsed.push(line.clone());
305 }
306 }
307
308 let output = collapsed.join("\n");
309 let compressed_tokens = count_tokens(&output);
310
311 EntropyResult {
312 output,
313 original_tokens,
314 compressed_tokens,
315 techniques,
316 }
317}
318
/// Summary statistics from an entropy scan of a piece of text.
#[derive(Debug)]
pub struct EntropyAnalysis {
    // Mean token entropy over non-blank lines (0.0 if there were none).
    pub avg_entropy: f64,
    // Non-blank lines with entropy below BPE_ENTROPY_THRESHOLD.
    pub low_entropy_count: usize,
    // Non-blank lines with entropy above 3.0 bits.
    pub high_entropy_count: usize,
    // Total line count, including blank lines.
    pub total_lines: usize,
}
326
327pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
328 let lines: Vec<&str> = content.lines().collect();
329 let total = lines.len();
330 let mut sum = 0.0;
331 let mut low = 0;
332 let mut high = 0;
333 let mut counted = 0;
334
335 for line in &lines {
336 let trimmed = line.trim();
337 if trimmed.is_empty() {
338 continue;
339 }
340 let h = token_entropy(trimmed);
341 sum += h;
342 counted += 1;
343 if h < BPE_ENTROPY_THRESHOLD {
344 low += 1;
345 }
346 if h > 3.0 {
347 high += 1;
348 }
349 }
350
351 EntropyAnalysis {
352 avg_entropy: if counted > 0 {
353 sum / counted as f64
354 } else {
355 0.0
356 },
357 low_entropy_count: low,
358 high_entropy_count: high,
359 total_lines: total,
360 }
361}
362
/// A contiguous run of non-blank lines, used for duplicate-pattern detection.
#[allow(dead_code)]
struct Block {
    // Index (into the line slice it was extracted from) of the block's first line.
    start: usize,
    // The block's trimmed lines, each followed by '\n'.
    content: String,
}
368
369fn extract_blocks(lines: &[&str]) -> Vec<Block> {
370 let mut blocks = Vec::new();
371 let mut current = String::new();
372 let mut start = 0;
373
374 for (i, line) in lines.iter().enumerate() {
375 let trimmed = line.trim();
376 if trimmed.is_empty() && !current.is_empty() {
377 blocks.push(Block {
378 start,
379 content: current.clone(),
380 });
381 current.clear();
382 } else if !trimmed.is_empty() {
383 if current.is_empty() {
384 start = i;
385 }
386 current.push_str(trimmed);
387 current.push('\n');
388 }
389 }
390
391 if !current.is_empty() {
392 blocks.push(Block {
393 start,
394 content: current,
395 });
396 }
397
398 blocks
399}
400
401fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
402 let mut groups: Vec<Vec<usize>> = Vec::new();
403 let mut assigned: HashSet<usize> = HashSet::new();
404
405 for (i, block_a) in blocks.iter().enumerate() {
406 if assigned.contains(&i) {
407 continue;
408 }
409 let mut group = vec![i];
410 for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
411 if assigned.contains(&j) {
412 continue;
413 }
414 if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
415 group.push(j);
416 assigned.insert(j);
417 }
418 }
419 if group.len() > 1 {
420 assigned.insert(i);
421 }
422 groups.push(group);
423 }
424
425 groups
426}
427
#[cfg(test)]
mod tests {
    //! Unit tests for the entropy, similarity, and compression helpers.
    use super::*;

    // --- Character-level Shannon entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        // A single repeated symbol carries no information.
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- Word-set Jaccard similarity ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- End-to-end compression and savings accounting ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Empty original must not divide by zero.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- BPE token entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- N-gram Jaccard (order-sensitive similarity) ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- MinHash signatures ---

    #[test]
    fn minhash_approximates_jaccard() {
        // With 128 hashes the MinHash estimate should land close to the
        // exact bigram Jaccard similarity.
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        // No n-grams => every slot stays at the u64::MAX sentinel.
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- Kolmogorov proxy and compressibility classes ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}