1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
// Lines whose BPE token entropy falls below this are treated as boilerplate.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
// Number of seeded hash functions per MinHash signature (accuracy vs. cost).
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of an entropy-based compression pass: the compressed text plus
/// before/after token counts and the techniques that fired.
#[derive(Debug)]
#[allow(dead_code)]
pub struct EntropyResult {
    pub output: String,           // compressed content
    pub original_tokens: usize,   // token count of the input
    pub compressed_tokens: usize, // token count of `output`
    pub techniques: Vec<String>,  // human-readable descriptions of applied steps
}
22
23#[allow(dead_code)]
24impl EntropyResult {
25 pub fn savings_percent(&self) -> f64 {
26 if self.original_tokens == 0 {
27 return 0.0;
28 }
29 let saved = self.original_tokens.saturating_sub(self.compressed_tokens);
30 (saved as f64 / self.original_tokens as f64) * 100.0
31 }
32}
33
/// Shannon entropy (bits per character) of the character distribution of
/// `text`. Empty input yields 0.0.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    let total = text.chars().count();
    if total == 0 {
        return 0.0;
    }
    let mut counts: HashMap<char, usize> = HashMap::new();
    for ch in text.chars() {
        *counts.entry(ch).or_default() += 1;
    }
    let n = total as f64;
    counts
        .values()
        .map(|&c| {
            let p = c as f64 / n;
            -p * p.log2()
        })
        .sum()
}
51
52pub fn token_entropy(text: &str) -> f64 {
55 let tokens = encode_tokens(text);
56 if tokens.is_empty() {
57 return 0.0;
58 }
59 let total = tokens.len();
60 let mut freq: HashMap<u32, usize> = HashMap::new();
61 for &t in &tokens {
62 *freq.entry(t).or_default() += 1;
63 }
64 freq.values().fold(0.0_f64, |acc, &count| {
65 let p = count as f64 / total as f64;
66 acc - p * p.log2()
67 })
68}
69
70pub fn normalized_token_entropy(text: &str) -> f64 {
74 let tokens = encode_tokens(text);
75 if tokens.is_empty() {
76 return 0.0;
77 }
78 let total = tokens.len();
79 let mut freq: HashMap<u32, usize> = HashMap::new();
80 for &t in &tokens {
81 *freq.entry(t).or_default() += 1;
82 }
83 let n_unique = freq.len();
84 if n_unique <= 1 {
85 return 0.0;
86 }
87 let h = freq.values().fold(0.0_f64, |acc, &count| {
88 let p = count as f64 / total as f64;
89 acc - p * p.log2()
90 });
91 let h_max = (n_unique as f64).log2();
92 h / h_max
93}
94
/// Jaccard similarity |A∩B| / |A∪B| over whitespace-separated word sets.
/// Two inputs with no words at all give 0.0.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let union = words_a.union(&words_b).count();
    if union == 0 {
        return 0.0;
    }
    let shared = words_a.intersection(&words_b).count();
    shared as f64 / union as f64
}
108
109pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
111 let set_a = ngram_set(a, n);
112 let set_b = ngram_set(b, n);
113
114 let intersection = set_a.intersection(&set_b).count();
115 let union = set_a.union(&set_b).count();
116
117 if union == 0 {
118 return 0.0;
119 }
120 intersection as f64 / union as f64
121}
122
/// Set of word n-grams (each a Vec of owned words). Texts with fewer than
/// `n` words fall back to a single n-gram of all words; empty text yields an
/// empty set.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    match words.len() {
        0 => HashSet::new(),
        len if len < n => {
            let gram: Vec<String> = words.iter().map(|w| (*w).to_string()).collect();
            std::iter::once(gram).collect()
        }
        _ => words
            .windows(n)
            .map(|win| win.iter().map(|w| (*w).to_string()).collect())
            .collect(),
    }
}
137
138#[allow(dead_code)]
141pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
142 let ngrams = ngram_set(text, n);
143 if ngrams.is_empty() {
144 return vec![u64::MAX; k];
145 }
146 let mut signature = vec![u64::MAX; k];
147 for ngram in &ngrams {
148 for (i, min) in signature.iter_mut().enumerate() {
149 let h = hash_with_seed(ngram, i as u64);
150 if h < *min {
151 *min = h;
152 }
153 }
154 }
155 signature
156}
157
/// Fraction of matching positions between two MinHash signatures — an
/// estimate of the underlying Jaccard similarity. Empty or length-mismatched
/// signatures yield 0.0.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut matches = 0usize;
    for (a, b) in sig_a.iter().zip(sig_b) {
        if a == b {
            matches += 1;
        }
    }
    matches as f64 / sig_a.len() as f64
}
171
/// Deterministic 64-bit hash of `value` perturbed by `seed`, so each seed
/// acts as an independent hash function (used by MinHash).
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    Hash::hash(&seed, &mut state);
    value.hash(&mut state);
    state.finish()
}
179
180pub fn kolmogorov_proxy(content: &str) -> f64 {
183 if content.is_empty() {
184 return 1.0;
185 }
186 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
187 encoder.write_all(content.as_bytes()).ok();
188 let compressed = encoder.finish().unwrap_or_default();
189 compressed.len() as f64 / content.len() as f64
190}
191
/// Coarse bucket for how compressible content is, derived from the
/// `kolmogorov_proxy` ratio K (compressed size / original size).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    High,   // K < 0.3: highly redundant, compresses well
    Medium, // 0.3 <= K < 0.6
    Low,    // K >= 0.6: dense or unique content
}
198
199impl CompressibilityClass {
200 pub fn label(&self) -> &'static str {
201 match self {
202 Self::High => "high (K<0.3)",
203 Self::Medium => "medium (0.3≤K<0.6)",
204 Self::Low => "low (K≥0.6)",
205 }
206 }
207}
208
209pub fn compressibility_class(content: &str) -> CompressibilityClass {
211 let k = kolmogorov_proxy(content);
212 if k < 0.3 {
213 CompressibilityClass::High
214 } else if k < 0.6 {
215 CompressibilityClass::Medium
216 } else {
217 CompressibilityClass::Low
218 }
219}
220
/// Compress `content` with the default BPE-entropy and Jaccard thresholds.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
224
/// Compress `content` with thresholds adapted to the file's path and
/// content (via `super::adaptive_thresholds`) instead of the fixed defaults.
pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
    entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard)
}
229
230fn entropy_compress_with_thresholds(
231 content: &str,
232 entropy_threshold: f64,
233 jaccard_threshold: f64,
234) -> EntropyResult {
235 let original_tokens = count_tokens(content);
236 let mut lines: Vec<&str> = content.lines().collect();
237 let mut techniques = Vec::new();
238
239 let original_count = lines.len();
240 lines.retain(|line| {
241 let trimmed = line.trim();
242 super::surprise::should_keep_line(trimmed, entropy_threshold)
243 });
244 let removed = original_count - lines.len();
245 if removed > 0 {
246 techniques.push(format!(
247 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
248 ));
249 }
250
251 let blocks = extract_blocks(&lines);
252 let groups = find_pattern_groups(&blocks, jaccard_threshold);
253 let mut dedup_count = 0;
254 for group in &groups {
255 if group.len() > 1 {
256 dedup_count += group.len() - 1;
257 }
258 }
259 if dedup_count > 0 {
260 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
261 }
262
263 let mut result: Vec<String> = Vec::new();
264 let mut skip_indices: HashSet<usize> = HashSet::new();
265 for group in &groups {
266 if group.len() > 1 {
267 for &idx in &group[1..] {
268 skip_indices.insert(idx);
269 }
270 }
271 }
272 for (i, line) in lines.iter().enumerate() {
273 if !skip_indices.contains(&i) {
274 result.push(line.to_string());
275 }
276 }
277
278 let mut collapsed = Vec::new();
279 let mut blank_count = 0;
280 for line in &result {
281 if line.trim().is_empty() {
282 blank_count += 1;
283 if blank_count <= 1 {
284 collapsed.push(line.clone());
285 }
286 } else {
287 blank_count = 0;
288 collapsed.push(line.clone());
289 }
290 }
291
292 let output = collapsed.join("\n");
293 let compressed_tokens = count_tokens(&output);
294
295 EntropyResult {
296 output,
297 original_tokens,
298 compressed_tokens,
299 techniques,
300 }
301}
302
/// Summary statistics from scanning content line-by-line with `token_entropy`.
#[derive(Debug)]
pub struct EntropyAnalysis {
    pub avg_entropy: f64,          // mean token entropy over non-blank lines
    pub low_entropy_count: usize,  // non-blank lines with H < BPE_ENTROPY_THRESHOLD
    pub high_entropy_count: usize, // non-blank lines with H > 3.0
    pub total_lines: usize,        // all lines, blank ones included
}
310
311pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
312 let lines: Vec<&str> = content.lines().collect();
313 let total = lines.len();
314 let mut sum = 0.0;
315 let mut low = 0;
316 let mut high = 0;
317 let mut counted = 0;
318
319 for line in &lines {
320 let trimmed = line.trim();
321 if trimmed.is_empty() {
322 continue;
323 }
324 let h = token_entropy(trimmed);
325 sum += h;
326 counted += 1;
327 if h < BPE_ENTROPY_THRESHOLD {
328 low += 1;
329 }
330 if h > 3.0 {
331 high += 1;
332 }
333 }
334
335 EntropyAnalysis {
336 avg_entropy: if counted > 0 {
337 sum / counted as f64
338 } else {
339 0.0
340 },
341 low_entropy_count: low,
342 high_entropy_count: high,
343 total_lines: total,
344 }
345}
346
/// A contiguous run of non-blank lines, the unit of duplicate detection.
#[allow(dead_code)]
struct Block {
    start: usize,    // index into the filtered line slice of the first line
    content: String, // trimmed lines joined by '\n' (trailing newline kept)
}
352
353fn extract_blocks(lines: &[&str]) -> Vec<Block> {
354 let mut blocks = Vec::new();
355 let mut current = String::new();
356 let mut start = 0;
357
358 for (i, line) in lines.iter().enumerate() {
359 let trimmed = line.trim();
360 if trimmed.is_empty() && !current.is_empty() {
361 blocks.push(Block {
362 start,
363 content: current.clone(),
364 });
365 current.clear();
366 } else if !trimmed.is_empty() {
367 if current.is_empty() {
368 start = i;
369 }
370 current.push_str(trimmed);
371 current.push('\n');
372 }
373 }
374
375 if !current.is_empty() {
376 blocks.push(Block {
377 start,
378 content: current,
379 });
380 }
381
382 blocks
383}
384
385fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
386 let mut groups: Vec<Vec<usize>> = Vec::new();
387 let mut assigned: HashSet<usize> = HashSet::new();
388
389 for (i, block_a) in blocks.iter().enumerate() {
390 if assigned.contains(&i) {
391 continue;
392 }
393 let mut group = vec![i];
394 for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
395 if assigned.contains(&j) {
396 continue;
397 }
398 if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
399 group.push(j);
400 assigned.insert(j);
401 }
402 }
403 if group.len() > 1 {
404 assigned.insert(i);
405 }
406 groups.push(group);
407 }
408
409 groups
410}
411
// Unit tests for the entropy, Jaccard/MinHash similarity, and
// compressibility helpers defined above.
#[cfg(test)]
mod tests {
    use super::*;

    // --- character-level Shannon entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- word-level Jaccard similarity ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- end-to-end compression pipeline ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n    println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n    let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- BPE token entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- n-gram Jaccard ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- MinHash ---

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- Kolmogorov proxy / compressibility classes ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}