1use std::collections::{HashMap, HashSet};
2use std::hash::{Hash, Hasher};
3use std::io::Write;
4
5use flate2::write::GzEncoder;
6use flate2::Compression;
7
8use super::tokens::{count_tokens, encode_tokens};
9
/// Minimum BPE token entropy (in bits) a line must have to survive
/// `entropy_compress`; lines below this read as repetitive boilerplate.
const BPE_ENTROPY_THRESHOLD: f64 = 1.0;
/// Number of hash functions in a MinHash signature; more hashes tighten the
/// Jaccard estimate at linear cost.
#[allow(dead_code)]
const MINHASH_NUM_HASHES: usize = 128;
13
/// Outcome of an entropy-based compression pass: the rewritten text together
/// with before/after token counts and the list of techniques that fired.
#[derive(Debug)]
#[allow(dead_code)]
pub struct EntropyResult {
    pub output: String,
    pub original_tokens: usize,
    pub compressed_tokens: usize,
    pub techniques: Vec<String>,
}

#[allow(dead_code)]
impl EntropyResult {
    /// Percentage of tokens saved relative to the original. Returns 0.0 when
    /// the original was empty or when compression did not shrink the input
    /// (the subtraction saturates at zero, never going negative).
    pub fn savings_percent(&self) -> f64 {
        match self.original_tokens {
            0 => 0.0,
            orig => {
                let saved = orig.saturating_sub(self.compressed_tokens) as f64;
                saved / orig as f64 * 100.0
            }
        }
    }
}
33
/// Character-level Shannon entropy of `text`, in bits per character.
/// Empty input yields 0.0.
#[allow(dead_code)]
pub fn shannon_entropy(text: &str) -> f64 {
    // Single pass: tally character frequencies and the total count together.
    let mut counts: HashMap<char, usize> = HashMap::new();
    let mut total = 0usize;
    for ch in text.chars() {
        *counts.entry(ch).or_insert(0) += 1;
        total += 1;
    }
    if total == 0 {
        return 0.0;
    }
    // H = -Σ p·log2(p) over the character distribution.
    counts
        .into_values()
        .map(|count| {
            let p = count as f64 / total as f64;
            -p * p.log2()
        })
        .sum()
}
51
52pub fn token_entropy(text: &str) -> f64 {
55 let tokens = encode_tokens(text);
56 if tokens.is_empty() {
57 return 0.0;
58 }
59 let total = tokens.len();
60 let mut freq: HashMap<u32, usize> = HashMap::new();
61 for &t in &tokens {
62 *freq.entry(t).or_default() += 1;
63 }
64 freq.values().fold(0.0_f64, |acc, &count| {
65 let p = count as f64 / total as f64;
66 acc - p * p.log2()
67 })
68}
69
/// Jaccard similarity of the whitespace-delimited word sets of `a` and `b`
/// (|A∩B| / |A∪B|). Two empty/whitespace-only inputs yield 0.0.
#[allow(dead_code)]
pub fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let words_a: HashSet<&str> = a.split_whitespace().collect();
    let words_b: HashSet<&str> = b.split_whitespace().collect();

    let shared = words_a.intersection(&words_b).count();
    let combined = words_a.union(&words_b).count();

    if combined == 0 {
        0.0
    } else {
        shared as f64 / combined as f64
    }
}
83
84pub fn ngram_jaccard(a: &str, b: &str, n: usize) -> f64 {
86 let set_a = ngram_set(a, n);
87 let set_b = ngram_set(b, n);
88
89 let intersection = set_a.intersection(&set_b).count();
90 let union = set_a.union(&set_b).count();
91
92 if union == 0 {
93 return 0.0;
94 }
95 intersection as f64 / union as f64
96}
97
/// Set of word n-grams (size `n`) of `text`, split on whitespace.
/// Inputs with fewer than `n` words (but at least one) contribute a single
/// "n-gram" containing all of their words; empty input yields an empty set.
fn ngram_set(text: &str, n: usize) -> HashSet<Vec<String>> {
    let words: Vec<&str> = text.split_whitespace().collect();
    let mut grams = HashSet::new();

    if words.len() < n {
        if !words.is_empty() {
            grams.insert(words.into_iter().map(String::from).collect());
        }
        return grams;
    }

    for window in words.windows(n) {
        grams.insert(window.iter().map(|s| (*s).to_string()).collect());
    }
    grams
}
112
113#[allow(dead_code)]
116pub fn minhash_signature(text: &str, n: usize, k: usize) -> Vec<u64> {
117 let ngrams = ngram_set(text, n);
118 if ngrams.is_empty() {
119 return vec![u64::MAX; k];
120 }
121 let mut signature = vec![u64::MAX; k];
122 for ngram in &ngrams {
123 for (i, min) in signature.iter_mut().enumerate() {
124 let h = hash_with_seed(ngram, i as u64);
125 if h < *min {
126 *min = h;
127 }
128 }
129 }
130 signature
131}
132
/// Estimated Jaccard similarity from two MinHash signatures: the fraction of
/// slots that agree. Signatures of different (or zero) length yield 0.0.
#[allow(dead_code)]
pub fn minhash_similarity(sig_a: &[u64], sig_b: &[u64]) -> f64 {
    if sig_a.is_empty() || sig_a.len() != sig_b.len() {
        return 0.0;
    }
    let mut equal = 0usize;
    for (x, y) in sig_a.iter().zip(sig_b) {
        if x == y {
            equal += 1;
        }
    }
    equal as f64 / sig_a.len() as f64
}
146
/// Seeded hash of `value`: the seed is folded into the hasher state before
/// the value, so different seeds act as independent hash functions for
/// MinHash. NOTE: `DefaultHasher` output is stable within a build but not
/// guaranteed across Rust releases — signatures should not be persisted.
#[allow(dead_code)]
fn hash_with_seed<T: Hash>(value: &T, seed: u64) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    // A tuple hashes its fields in order, matching "seed first, then value".
    (seed, value).hash(&mut state);
    state.finish()
}
154
/// Proxy for Kolmogorov complexity: gzip-compressed size divided by the
/// original byte length. Lower values mean more redundant, more compressible
/// text. Empty input is defined as 1.0 (treated as incompressible).
pub fn kolmogorov_proxy(content: &str) -> f64 {
    if content.is_empty() {
        return 1.0;
    }
    let mut encoder = GzEncoder::new(Vec::new(), Compression::default());
    // Write errors are deliberately ignored: the sink is an in-memory Vec.
    // NOTE(review): if `finish()` ever failed, the empty fallback would make
    // the ratio ~0.0 (reported as "highly compressible") — confirm acceptable.
    encoder.write_all(content.as_bytes()).ok();
    let compressed = encoder.finish().unwrap_or_default();
    compressed.len() as f64 / content.len() as f64
}
166
/// Coarse bucket for the `kolmogorov_proxy` ratio K, as assigned by
/// `compressibility_class`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressibilityClass {
    High,
    Medium,
    Low,
}

impl CompressibilityClass {
    /// Human-readable label including the K range that maps to this class.
    pub fn label(&self) -> &'static str {
        match *self {
            CompressibilityClass::High => "high (K<0.3)",
            CompressibilityClass::Medium => "medium (0.3≤K<0.6)",
            CompressibilityClass::Low => "low (K≥0.6)",
        }
    }
}
183
184pub fn compressibility_class(content: &str) -> CompressibilityClass {
186 let k = kolmogorov_proxy(content);
187 if k < 0.3 {
188 CompressibilityClass::High
189 } else if k < 0.6 {
190 CompressibilityClass::Medium
191 } else {
192 CompressibilityClass::Low
193 }
194}
195
/// Compresses `content` with the default thresholds: `BPE_ENTROPY_THRESHOLD`
/// for low-entropy line removal and 0.7 bigram-Jaccard for block dedup.
pub fn entropy_compress(content: &str) -> EntropyResult {
    entropy_compress_with_thresholds(content, BPE_ENTROPY_THRESHOLD, 0.7)
}
199
/// Like `entropy_compress`, but derives both thresholds from the file `path`
/// and `content` via the sibling `adaptive_thresholds` module.
pub fn entropy_compress_adaptive(content: &str, path: &str) -> EntropyResult {
    let thresholds = super::adaptive_thresholds::adaptive_thresholds(path, content);
    entropy_compress_with_thresholds(content, thresholds.bpe_entropy, thresholds.jaccard)
}
204
205fn entropy_compress_with_thresholds(
206 content: &str,
207 entropy_threshold: f64,
208 jaccard_threshold: f64,
209) -> EntropyResult {
210 let original_tokens = count_tokens(content);
211 let mut lines: Vec<&str> = content.lines().collect();
212 let mut techniques = Vec::new();
213
214 let original_count = lines.len();
215 lines.retain(|line| {
216 let trimmed = line.trim();
217 if trimmed.is_empty() || trimmed.len() < 3 {
218 return true;
219 }
220 token_entropy(trimmed) >= entropy_threshold
221 });
222 let removed = original_count - lines.len();
223 if removed > 0 {
224 techniques.push(format!(
225 "⊘ {removed} low-entropy lines (BPE H<{entropy_threshold:.2})"
226 ));
227 }
228
229 let blocks = extract_blocks(&lines);
230 let groups = find_pattern_groups(&blocks, jaccard_threshold);
231 let mut dedup_count = 0;
232 for group in &groups {
233 if group.len() > 1 {
234 dedup_count += group.len() - 1;
235 }
236 }
237 if dedup_count > 0 {
238 techniques.push(format!("⊘ {dedup_count} duplicate patterns (J≥0.7)"));
239 }
240
241 let mut result: Vec<String> = Vec::new();
242 let mut skip_indices: HashSet<usize> = HashSet::new();
243 for group in &groups {
244 if group.len() > 1 {
245 for &idx in &group[1..] {
246 skip_indices.insert(idx);
247 }
248 }
249 }
250 for (i, line) in lines.iter().enumerate() {
251 if !skip_indices.contains(&i) {
252 result.push(line.to_string());
253 }
254 }
255
256 let mut collapsed = Vec::new();
257 let mut blank_count = 0;
258 for line in &result {
259 if line.trim().is_empty() {
260 blank_count += 1;
261 if blank_count <= 1 {
262 collapsed.push(line.clone());
263 }
264 } else {
265 blank_count = 0;
266 collapsed.push(line.clone());
267 }
268 }
269
270 let output = collapsed.join("\n");
271 let compressed_tokens = count_tokens(&output);
272
273 EntropyResult {
274 output,
275 original_tokens,
276 compressed_tokens,
277 techniques,
278 }
279}
280
/// Summary statistics from `analyze_entropy`: per-line BPE token entropy
/// aggregated over the non-blank lines of a document.
#[derive(Debug)]
pub struct EntropyAnalysis {
    // Mean token entropy over non-blank lines (0.0 if there are none).
    pub avg_entropy: f64,
    // Non-blank lines with entropy below BPE_ENTROPY_THRESHOLD.
    pub low_entropy_count: usize,
    // Non-blank lines with entropy above 3.0 bits.
    pub high_entropy_count: usize,
    // Total line count, blank lines included.
    pub total_lines: usize,
}
288
289pub fn analyze_entropy(content: &str) -> EntropyAnalysis {
290 let lines: Vec<&str> = content.lines().collect();
291 let total = lines.len();
292 let mut sum = 0.0;
293 let mut low = 0;
294 let mut high = 0;
295 let mut counted = 0;
296
297 for line in &lines {
298 let trimmed = line.trim();
299 if trimmed.is_empty() {
300 continue;
301 }
302 let h = token_entropy(trimmed);
303 sum += h;
304 counted += 1;
305 if h < BPE_ENTROPY_THRESHOLD {
306 low += 1;
307 }
308 if h > 3.0 {
309 high += 1;
310 }
311 }
312
313 EntropyAnalysis {
314 avg_entropy: if counted > 0 {
315 sum / counted as f64
316 } else {
317 0.0
318 },
319 low_entropy_count: low,
320 high_entropy_count: high,
321 total_lines: total,
322 }
323}
324
/// A contiguous run of non-blank lines, as produced by `extract_blocks`.
#[allow(dead_code)]
struct Block {
    // Index of the block's first line in the slice given to `extract_blocks`.
    start: usize,
    // The block's lines, each trimmed and terminated with '\n'.
    content: String,
}
330
331fn extract_blocks(lines: &[&str]) -> Vec<Block> {
332 let mut blocks = Vec::new();
333 let mut current = String::new();
334 let mut start = 0;
335
336 for (i, line) in lines.iter().enumerate() {
337 let trimmed = line.trim();
338 if trimmed.is_empty() && !current.is_empty() {
339 blocks.push(Block {
340 start,
341 content: current.clone(),
342 });
343 current.clear();
344 } else if !trimmed.is_empty() {
345 if current.is_empty() {
346 start = i;
347 }
348 current.push_str(trimmed);
349 current.push('\n');
350 }
351 }
352
353 if !current.is_empty() {
354 blocks.push(Block {
355 start,
356 content: current,
357 });
358 }
359
360 blocks
361}
362
363fn find_pattern_groups(blocks: &[Block], threshold: f64) -> Vec<Vec<usize>> {
364 let mut groups: Vec<Vec<usize>> = Vec::new();
365 let mut assigned: HashSet<usize> = HashSet::new();
366
367 for (i, block_a) in blocks.iter().enumerate() {
368 if assigned.contains(&i) {
369 continue;
370 }
371 let mut group = vec![i];
372 for (j, block_b) in blocks.iter().enumerate().skip(i + 1) {
373 if assigned.contains(&j) {
374 continue;
375 }
376 if ngram_jaccard(&block_a.content, &block_b.content, 2) >= threshold {
377 group.push(j);
378 assigned.insert(j);
379 }
380 }
381 if group.len() > 1 {
382 assigned.insert(i);
383 }
384 groups.push(group);
385 }
386
387 groups
388}
389
// Unit tests for the entropy metrics, similarity measures, and the
// compression pipeline. All fixtures are inline; no I/O.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn shannon_entropy_empty_is_zero() {
        assert_eq!(shannon_entropy(""), 0.0);
    }

    #[test]
    fn shannon_entropy_single_char() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_high_for_varied_text() {
        let varied = "abcdefghijklmnopqrstuvwxyz0123456789";
        let uniform = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        assert!(
            shannon_entropy(varied) > shannon_entropy(uniform),
            "varied text should have higher entropy"
        );
    }

    #[test]
    fn jaccard_identical_is_one() {
        let sim = jaccard_similarity("hello world", "hello world");
        assert!((sim - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn jaccard_disjoint_is_zero() {
        let sim = jaccard_similarity("abc", "xyz");
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn jaccard_partial_overlap() {
        let sim = jaccard_similarity("hello world", "hello rust");
        assert!(sim > 0.0 && sim < 1.0);
    }

    // Smoke test for the full pipeline: output exists and never grows.
    #[test]
    fn entropy_compress_produces_output() {
        let content = "fn main() {\n println!(\"hello\");\n}\n\n// comment\n// another comment\n\nfn helper() {\n let x = 42;\n}\n";
        let result = entropy_compress(content);
        assert!(!result.output.is_empty(), "should produce non-empty output");
        assert!(result.compressed_tokens <= result.original_tokens);
    }

    #[test]
    fn entropy_result_savings() {
        let r = EntropyResult {
            output: "short".to_string(),
            original_tokens: 100,
            compressed_tokens: 60,
            techniques: vec!["test".to_string()],
        };
        assert!((r.savings_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn entropy_result_zero_original() {
        let r = EntropyResult {
            output: String::new(),
            original_tokens: 0,
            compressed_tokens: 0,
            techniques: vec![],
        };
        assert_eq!(r.savings_percent(), 0.0);
    }

    #[test]
    fn token_entropy_empty_is_zero() {
        assert_eq!(token_entropy(""), 0.0);
    }

    #[test]
    fn token_entropy_single_repeated_token() {
        assert_eq!(token_entropy("}"), 0.0);
    }

    #[test]
    fn token_entropy_higher_for_diverse_code() {
        let diverse = "let result = compute_something(x, y, z);";
        let repetitive = "aaaa aaaa aaaa aaaa";
        assert!(
            token_entropy(diverse) > token_entropy(repetitive),
            "diverse code should have higher BPE token entropy"
        );
    }

    #[test]
    fn token_entropy_vs_char_entropy_differ() {
        let code = "fn main() { println!(\"hello world\"); }";
        let te = token_entropy(code);
        let ce = shannon_entropy(code);
        assert!(te != ce, "BPE and char entropy should differ for code");
    }

    // Bigram Jaccard should penalize reordering that word Jaccard misses.
    #[test]
    fn ngram_jaccard_preserves_order() {
        let a = "a b c d";
        let b = "d c b a";
        let word_j = jaccard_similarity(a, b);
        let ngram_j = ngram_jaccard(a, b, 2);
        assert!(
            ngram_j < word_j,
            "reordered text should have lower bigram Jaccard ({ngram_j}) than word Jaccard ({word_j})"
        );
    }

    #[test]
    fn ngram_jaccard_identical_is_one() {
        let text = "fn main() { println!(\"hello\"); }";
        let j = ngram_jaccard(text, text, 2);
        assert!((j - 1.0).abs() < f64::EPSILON);
    }

    #[test]
    fn ngram_jaccard_disjoint_is_zero() {
        let j = ngram_jaccard("alpha beta gamma", "delta epsilon zeta", 2);
        assert_eq!(j, 0.0);
    }

    // Probabilistic bound: 128 hashes should land within 0.2 of exact Jaccard.
    #[test]
    fn minhash_approximates_jaccard() {
        let a = "fn main() { let x = 1; let y = 2; let z = x + y; println!(z); }";
        let b = "fn main() { let x = 1; let y = 2; let z = x + y; return z; }";
        let exact = ngram_jaccard(a, b, 2);
        let sig_a = minhash_signature(a, 2, MINHASH_NUM_HASHES);
        let sig_b = minhash_signature(b, 2, MINHASH_NUM_HASHES);
        let approx = minhash_similarity(&sig_a, &sig_b);
        assert!(
            (exact - approx).abs() < 0.2,
            "minhash ({approx}) should approximate exact ({exact}) within 0.2"
        );
    }

    #[test]
    fn minhash_empty_text() {
        let sig = minhash_signature("", 2, 64);
        assert!(sig.iter().all(|&v| v == u64::MAX));
    }

    #[test]
    fn kolmogorov_empty_is_one() {
        assert_eq!(kolmogorov_proxy(""), 1.0);
    }

    #[test]
    fn kolmogorov_repetitive_is_low() {
        let repetitive = "aaa\n".repeat(1000);
        let k = kolmogorov_proxy(&repetitive);
        assert!(
            k < 0.1,
            "highly repetitive text should compress well: K={k}"
        );
    }

    #[test]
    fn kolmogorov_diverse_is_higher() {
        let repetitive = "aaa\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("line_{i}_unique_content_{}", i * 17 % 97))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&repetitive),
            "diverse content should have higher K than repetitive"
        );
    }

    #[test]
    fn compressibility_class_repetitive_is_high() {
        let text = "use std::io;\n".repeat(200);
        assert_eq!(compressibility_class(&text), CompressibilityClass::High);
    }

    // NOTE(review): near-duplicate of `kolmogorov_diverse_is_higher` above —
    // consider consolidating the two into one test.
    #[test]
    fn kolmogorov_diverse_higher_than_repetitive() {
        let rep = "test\n".repeat(500);
        let diverse = (0..500)
            .map(|i| format!("unique_line_{i}_xk{}", i * 31 % 1000))
            .collect::<Vec<_>>()
            .join("\n");
        assert!(
            kolmogorov_proxy(&diverse) > kolmogorov_proxy(&rep),
            "diverse content should have higher K"
        );
    }
}
581}