1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
/// Lines whose BPE token entropy (bits per token) falls below this value
/// are candidates for removal during entropy compression (a normalized
/// entropy floor can still rescue them — see `entropy_compress_with_thresholds`).
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
/// Default MinHash signature length (number of hash functions); currently
/// only exercised by the tests.
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of one entropy-compression run: the compressed text plus
/// token counts before/after and a human-readable list of techniques
/// that actually fired.
#[derive(Debug)]
#[allow(dead_code)]
pub struct EntropyResult {
    pub output: String,
    pub original_tokens: usize,
    pub compressed_tokens: usize,
    pub techniques: Vec<String>,
}

#[allow(dead_code)]
impl EntropyResult {
    /// Percentage of tokens saved relative to the original.
    ///
    /// Returns 0.0 when the original was empty, and clamps at 0.0 if the
    /// "compressed" output somehow grew.
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            orig => {
                let kept = self.compressed_tokens.min(orig);
                ((orig - kept) as f64 / orig as f64) * 100.0
            }
        }
    }
}
33
/// Shannon entropy of `text` over its character distribution, in bits
/// per character. Empty input yields 0.0.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    let total = text.chars().count();
    if total == 0 {
        return 0.0;
    }

    let mut counts: HashMap<char, usize> = HashMap::new();
    for ch in text.chars() {
        *counts.entry(ch).or_insert(0) += 1;
    }

    let total = total as f64;
    counts
        .values()
        .map(|&count| {
            let p = count as f64 / total;
            -p * p.log2()
        })
        .sum()
}
51
52pub fn token_entropy(text: &str) -> f64 {
55 let tokens = encode_tokens(text);
56 if tokens.is_empty() {
57 return 0.0;
58 }
59 let total = tokens.len();
60 let mut freq: HashMap<u32, usize> = HashMap::new();
61 for &t in &tokens {
62 *freq.entry(t).or_default() += 1;
63 }
64 freq.values().fold(0.0_f64, |acc, &count| {
65 let p = count as f64 / total as f64;
66 acc - p * p.log2()
67 })
68}
69
70pub fn normalized_token_entropy(text: &str) -> f64 {
74 let tokens = encode_tokens(text);
75 if tokens.is_empty() {
76 return 0.0;
77 }
78 let total = tokens.len();
79 let mut freq: HashMap<u32, usize> = HashMap::new();
80 for &t in &tokens {
81 *freq.entry(t).or_default() += 1;
82 }
83 let n_unique = freq.len();
84 if n_unique <= 1 {
85 return 0.0;
86 }
87 let h = freq.values().fold(0.0_f64, |acc, &count| {
88 let p = count as f64 / total as f64;
89 acc - p * p.log2()
90 });
91 let h_max = (n_unique as f64).log2();
92 h / h_max
93}
94
/// Jaccard similarity between the whitespace-separated word *sets* of
/// `a` and `b` (order-insensitive). Two empty inputs yield 0.0.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let union = words_a.union(&words_b).count();
    if union == 0 {
        return 0.0;
    }
    let shared = words_a.intersection(&words_b).count();
    shared as f64 / union as f64
}
108
109pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
111 let set_a = ngram_set(a, n);
112 let set_b = ngram_set(b, n);
113
114 let intersection = set_a.intersection(&set_b).count();
115 let union = set_a.union(&set_b).count();
116
117 if union == 0 {
118 return 0.0;
119 }
120 intersection as f64 / union as f64
121}
122
/// The set of word-`n`-grams of `text` (whitespace tokenization).
///
/// If the text has fewer than `n` words, the whole word sequence is used
/// as a single "gram" so short inputs still compare non-trivially.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();

    if words.len() < n {
        let mut singleton = HashSet::new();
        if !words.is_empty() {
            let whole: Vec<String> = words.iter().map(|w| (*w).to_string()).collect();
            singleton.insert(whole);
        }
        return singleton;
    }

    words
        .windows(n)
        .map(|win| win.iter().map(|w| (*w).to_string()).collect())
        .collect()
}
137
138#[allow(dead_code)]
141pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
142 let ngrams = ngram_set(text, n);
143 if ngrams.is_empty() {
144 return vec![u64::MAX; k];
145 }
146 let mut signature = vec![u64::MAX; k];
147 for ngram in &ngrams {
148 for (i, min) in signature.iter_mut().enumerate() {
149 let h = hash_with_seed(ngram, i as u64);
150 if h < *min {
151 *min = h;
152 }
153 }
154 }
155 signature
156}
157
/// Estimated Jaccard similarity from two MinHash signatures: the
/// fraction of positions where the signatures agree. Returns 0.0 for
/// empty or length-mismatched signatures.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut matching = 0usize;
    for (x, y) in sig_a.iter().zip(sig_b) {
        if x == y {
            matching += 1;
        }
    }
    matching as f64 / sig_a.len() as f64
}
171
/// Hash `value` with a seed mixed in first, simulating a family of
/// independent hash functions for MinHash. The seed is fed to the
/// hasher *before* the value, so distinct seeds give distinct functions.
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    let mut state = DefaultHasher::new();
    seed.hash(&mut state);
    value.hash(&mut state);
    state.finish()
}
179
180pub fn kolmogorov_proxy(content: &str) -> f64 {
183 if content.is_empty() {
184 return 1.0;
185 }
186 let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
187 encoder.write_all(content.as_bytes()).ok();
188 let compressed = encoder.finish().unwrap_or_default();
189 compressed.len() as f64 / content.len() as f64
190}
191
/// Coarse bucket for the gzip compression ratio K computed by
/// [`kolmogorov_proxy`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    High,
    Medium,
    Low,
}

impl CompressibilityClass {
    /// Human-readable label, including the K range that maps to this bucket.
    pub fn label(&self) -> &'static str {
        match self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
208
209pub fn compressibility_class(content: &str) -> CompressibilityClass {
211 let k = kolmogorov_proxy(content);
212 if k < 0.3 {
213 CompressibilityClass::High
214 } else if k < 0.6 {
215 CompressibilityClass::Medium
216 } else {
217 CompressibilityClass::Low
218 }
219}
220
/// Compress `content` with the default thresholds: BPE entropy cutoff
/// `BPE_ENTROPY_THRESHOLD` and a bigram-Jaccard duplicate cutoff of 0.7.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
224
/// Compress `content` with thresholds chosen by `adaptive_thresholds`
/// (presumably tuned per file from `path` and the content's statistics —
/// see `super::adaptive_thresholds` for the actual policy).
pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
    entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard)
}
229
230fn entropy_compress_with_thresholds(
231 content: &str,
232 entropy_threshold: f64,
233 jaccard_threshold: f64,
234) -> EntropyResult {
235 let original_tokens = count_tokens(content);
236 let mut lines: Vec<&str> = content.lines().collect();
237 let mut techniques = Vec::new();
238
239 let original_count = lines.len();
240 lines.retain(|line| {
241 let trimmed = line.trim();
242 if trimmed.is_empty() || trimmed.len() < 3 {
243 return true;
244 }
245 let h = token_entropy(trimmed);
246 if h >= entropy_threshold {
247 return true;
248 }
249 let h_norm = normalized_token_entropy(trimmed);
250 h_norm >= 0.3
251 });
252 let removed = original_count - lines.len();
253 if removed > 0 {
254 techniques.push(format!(
255 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
256 ));
257 }
258
259 let blocks = extract_blocks(&lines);
260 let groups = find_pattern_groups(&blocks, jaccard_threshold);
261 let mut dedup_count = 0;
262 for group in &groups {
263 if group.len() > 1 {
264 dedup_count += group.len() - 1;
265 }
266 }
267 if dedup_count > 0 {
268 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
269 }
270
271 let mut result: Vec<String> = Vec::new();
272 let mut skip_indices: HashSet<usize> = HashSet::new();
273 for group in &groups {
274 if group.len() > 1 {
275 for &idx in &group[1..] {
276 skip_indices.insert(idx);
277 }
278 }
279 }
280 for (i, line) in lines.iter().enumerate() {
281 if !skip_indices.contains(&i) {
282 result.push(line.to_string());
283 }
284 }
285
286 let mut collapsed = Vec::new();
287 let mut blank_count = 0;
288 for line in &result {
289 if line.trim().is_empty() {
290 blank_count += 1;
291 if blank_count <= 1 {
292 collapsed.push(line.clone());
293 }
294 } else {
295 blank_count = 0;
296 collapsed.push(line.clone());
297 }
298 }
299
300 let output = collapsed.join("\n");
301 let compressed_tokens = count_tokens(&output);
302
303 EntropyResult {
304 output,
305 original_tokens,
306 compressed_tokens,
307 techniques,
308 }
309}
310
/// Summary statistics produced by [`analyze_entropy`].
#[derive(Debug)]
pub struct EntropyAnalysis {
    /// Mean BPE token entropy over non-blank lines (0.0 if there are none).
    pub avg_entropy: f64,
    /// Non-blank lines with entropy below `BPE_ENTROPY_THRESHOLD`.
    pub low_entropy_count: usize,
    /// Non-blank lines with entropy above 3.0 bits/token.
    pub high_entropy_count: usize,
    /// Total line count, including blank lines.
    pub total_lines: usize,
}
318
319pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
320 let lines: Vec<&str> = content.lines().collect();
321 let total = lines.len();
322 let mut sum = 0.0;
323 let mut low = 0;
324 let mut high = 0;
325 let mut counted = 0;
326
327 for line in &lines {
328 let trimmed = line.trim();
329 if trimmed.is_empty() {
330 continue;
331 }
332 let h = token_entropy(trimmed);
333 sum += h;
334 counted += 1;
335 if h < BPE_ENTROPY_THRESHOLD {
336 low += 1;
337 }
338 if h > 3.0 {
339 high += 1;
340 }
341 }
342
343 EntropyAnalysis {
344 avg_entropy: if counted > 0 {
345 sum / counted as f64
346 } else {
347 0.0
348 },
349 low_entropy_count: low,
350 high_entropy_count: high,
351 total_lines: total,
352 }
353}
354
/// A contiguous run of non-blank lines from the input slice.
#[allow(dead_code)]
struct Block {
    // Index (into the input slice) of the block's first line.
    start: usize,
    // The block's trimmed lines, each followed by '\n'.
    content: String,
}

/// Split `lines` into blank-line-separated blocks. Each block records
/// its first line's index and its trimmed content (one line per '\n').
fn extract_blocks(lines: &[&str]) -> Vec<Block> {
    let mut blocks = Vec::new();
    let mut buf = String::new();
    let mut first_line = 0usize;

    for (idx, raw) in lines.iter().enumerate() {
        let line = raw.trim();
        match (line.is_empty(), buf.is_empty()) {
            // Blank line terminates a block in progress.
            (true, false) => {
                blocks.push(Block {
                    start: first_line,
                    content: std::mem::take(&mut buf),
                });
            }
            // Non-blank line starts or extends a block.
            (false, was_empty) => {
                if was_empty {
                    first_line = idx;
                }
                buf.push_str(line);
                buf.push('\n');
            }
            // Blank line with nothing buffered: nothing to do.
            (true, true) => {}
        }
    }

    // Flush a trailing block that was not followed by a blank line.
    if !buf.is_empty() {
        blocks.push(Block {
            start: first_line,
            content: buf,
        });
    }

    blocks
}
392
393fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
394 let mut groups: Vec<Vec<usize>> = Vec::new();
395 let mut assigned: HashSet<usize> = HashSet::new();
396
397 for (i, block_a) in blocks.iter().enumerate() {
398 if assigned.contains(&i) {
399 continue;
400 }
401 let mut group = vec![i];
402 for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
403 if assigned.contains(&j) {
404 continue;
405 }
406 if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
407 group.push(j);
408 assigned.insert(j);
409 }
410 }
411 if group.len() > 1 {
412 assigned.insert(i);
413 }
414 groups.push(group);
415 }
416
417 groups
418}
419
#[cfg(test)]
mod tests {
    //! Unit tests for the entropy measures, Jaccard/MinHash similarity
    //! helpers, the gzip Kolmogorov proxy, and the compression pipeline.
    use super::*;

    // --- Character-level Shannon entropy ---

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        // A one-symbol alphabet carries zero information.
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    // --- Word-set Jaccard similarity ---

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // --- Compression pipeline ---

    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        // Division-by-zero guard: empty original reports 0% savings.
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    // --- BPE token entropy ---

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // --- N-gram Jaccard (order-sensitive) ---

    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // --- MinHash ---

    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        // No n-grams means no slot is ever lowered from its sentinel.
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    // --- Kolmogorov proxy & compressibility class ---

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}