1use super::Match;
26
27#[inline]
40pub fn fast_entropy_estimate(data: &[u8]) -> f32 {
41 if data.len() < 16 {
42 return 4.0; }
44
45 const SAMPLE_SIZE: usize = 256;
47 let step = data.len() / SAMPLE_SIZE.min(data.len());
48 let step = step.max(1);
49
50 let mut freq = [0u16; 256];
52 let mut sample_count = 0u32;
53
54 let mut i = 0;
55 while i < data.len() && sample_count < SAMPLE_SIZE as u32 {
56 freq[data[i] as usize] += 1;
57 sample_count += 1;
58 i += step;
59 }
60
61 if sample_count == 0 {
62 return 4.0;
63 }
64
65 let n = sample_count as f32;
67 let mut entropy: f32 = 0.0;
68
69 for &f in &freq {
71 if f > 0 {
72 let p = f as f32 / n;
73 entropy -= p * p.log2();
74 }
75 }
76
77 entropy
78}
79
80#[inline]
92pub fn fast_should_compress(data: &[u8]) -> bool {
93 if data.len() < 32 {
94 return true; }
96
97 let entropy = fast_entropy_estimate(data);
99
100 entropy < 7.5
107}
108
109#[derive(Debug, Clone, Copy, PartialEq, Eq)]
111pub enum FastBlockType {
112 Raw,
114 Rle,
116 Compress,
118}
119
120#[inline]
124pub fn fast_predict_block_type(data: &[u8]) -> FastBlockType {
125 if data.len() < 4 {
126 return FastBlockType::Raw;
127 }
128
129 let first = data[0];
131 let is_uniform = data.iter().take(64.min(data.len())).all(|&b| b == first);
132 if is_uniform && data.len() >= 4 {
133 if data.len() <= 64 || {
135 let step = data.len() / 16;
136 (0..16).all(|i| data.get(i * step).copied() == Some(first))
137 } {
138 return FastBlockType::Rle;
139 }
140 }
141
142 let entropy = fast_entropy_estimate(data);
144
145 if entropy > 7.5 {
146 FastBlockType::Raw
147 } else {
148 FastBlockType::Compress
149 }
150}
151
152#[derive(Debug, Clone)]
154pub struct CompressibilityFingerprint {
155 pub entropy: f32,
157 pub pattern: PatternType,
159 pub estimated_ratio: f32,
161 pub strategy: CompressionStrategy,
163}
164
165#[derive(Debug, Clone, Copy, PartialEq, Eq)]
167pub enum PatternType {
168 Uniform,
170 LowEntropy,
172 Periodic { period: usize },
174 TextLike,
176 HighEntropy,
178 Random,
180}
181
182#[derive(Debug, Clone, Copy, PartialEq, Eq)]
184pub enum CompressionStrategy {
185 RleBlock,
187 RleSequences,
189 PredefinedFse,
191 RawBlock,
193}
194
195impl CompressibilityFingerprint {
196 pub fn analyze(data: &[u8]) -> Self {
200 if data.is_empty() {
201 return Self {
202 entropy: 0.0,
203 pattern: PatternType::Uniform,
204 estimated_ratio: 1.0,
205 strategy: CompressionStrategy::RawBlock,
206 };
207 }
208
209 #[cfg(feature = "simd")]
211 let freq = haagenti_simd::byte_histogram(data);
212
213 #[cfg(not(feature = "simd"))]
214 let freq = {
215 let mut f = [0u32; 256];
216 for &b in data {
217 f[b as usize] += 1;
218 }
219 f
220 };
221
222 let unique_bytes = freq.iter().filter(|&&f| f > 0).count();
224
225 if unique_bytes == 1 {
227 return Self {
228 entropy: 0.0,
229 pattern: PatternType::Uniform,
230 estimated_ratio: 0.01, strategy: CompressionStrategy::RleBlock,
232 };
233 }
234
235 let len = data.len() as f32;
237 let entropy: f32 = freq
238 .iter()
239 .filter(|&&f| f > 0)
240 .map(|&f| {
241 let p = f as f32 / len;
242 -p * p.log2()
243 })
244 .sum();
245
246 let period = detect_period(data);
248 let pattern = if let Some(p) = period {
249 PatternType::Periodic { period: p }
250 } else if entropy < 3.0 {
251 PatternType::LowEntropy
252 } else if entropy < 5.5 && is_text_like(&freq) {
253 PatternType::TextLike
254 } else if entropy > 7.5 {
255 PatternType::Random
256 } else {
257 PatternType::HighEntropy
258 };
259
260 let estimated_ratio = match pattern {
262 PatternType::Uniform => 0.01,
263 PatternType::Periodic { period } => 0.1 + (period as f32 * 0.02),
264 PatternType::LowEntropy => 0.3 + (entropy / 10.0),
265 PatternType::TextLike => 0.4 + (entropy / 20.0),
266 PatternType::HighEntropy => 0.8 + (entropy / 40.0),
267 PatternType::Random => 1.1, };
269
270 let has_runs = data.len() >= 8 && {
273 let mut runs = 0;
274 let mut i = 0;
275 while i < data.len().min(256) {
276 let start = i;
277 while i < data.len().min(256) && data[i] == data[start] {
278 i += 1;
279 }
280 if i - start >= 4 {
281 runs += 1;
282 }
283 }
284 runs >= 3 };
286
287 let strategy = match pattern {
289 PatternType::Uniform => CompressionStrategy::RleBlock,
290 PatternType::Periodic { period } if period <= 4 => CompressionStrategy::RleSequences,
291 PatternType::LowEntropy if unique_bytes <= 16 => CompressionStrategy::RleSequences,
292 PatternType::TextLike => CompressionStrategy::PredefinedFse,
293 PatternType::Periodic { .. } => CompressionStrategy::PredefinedFse,
294 PatternType::LowEntropy => CompressionStrategy::PredefinedFse,
295 PatternType::HighEntropy if has_runs => CompressionStrategy::PredefinedFse,
297 PatternType::HighEntropy if estimated_ratio < 0.95 => {
298 CompressionStrategy::PredefinedFse
299 }
300 _ => CompressionStrategy::RawBlock,
301 };
302
303 Self {
304 entropy,
305 pattern,
306 estimated_ratio,
307 strategy,
308 }
309 }
310
311 pub fn refine_with_matches(&mut self, data: &[u8], matches: &[Match]) {
315 if matches.is_empty() {
316 if self.strategy == CompressionStrategy::RleSequences
318 || self.strategy == CompressionStrategy::PredefinedFse
319 {
320 self.strategy = CompressionStrategy::RawBlock;
321 self.estimated_ratio = 1.05; }
323 return;
324 }
325
326 let matched_bytes: usize = matches.iter().map(|m| m.length).sum();
328 let coverage = matched_bytes as f32 / data.len() as f32;
329
330 let (uniform_offsets, uniform_lengths) = analyze_match_uniformity(matches);
332
333 if coverage > 0.5 && uniform_offsets && uniform_lengths {
335 self.strategy = CompressionStrategy::RleSequences;
337 self.estimated_ratio = 0.2 + (1.0 - coverage) * 0.5;
338 } else if coverage > 0.3 {
339 self.strategy = CompressionStrategy::PredefinedFse;
341 self.estimated_ratio = 0.4 + (1.0 - coverage) * 0.4;
342 } else if coverage < 0.1 {
343 self.strategy = CompressionStrategy::RawBlock;
345 self.estimated_ratio = 1.02;
346 }
347 }
348}
349
350fn detect_period(data: &[u8]) -> Option<usize> {
352 if data.len() < 8 {
353 return None;
354 }
355
356 for period in 1..=8.min(data.len() / 2) {
358 let mut matches = 0;
359 let mut checks = 0;
360
361 let step = (data.len() / 32).max(1);
363 let mut i = period;
364 while i < data.len() {
365 if data[i] == data[i - period] {
366 matches += 1;
367 }
368 checks += 1;
369 i += step;
370 }
371
372 if checks > 0 && matches as f32 / checks as f32 > 0.9 {
373 return Some(period);
374 }
375 }
376
377 None
378}
379
380fn is_text_like(freq: &[u32; 256]) -> bool {
382 let printable: u32 = freq[32..=126].iter().sum();
384 let total: u32 = freq.iter().sum();
385
386 if total == 0 {
387 return false;
388 }
389
390 let common_text = freq[32]
392 + freq[b'e' as usize]
393 + freq[b't' as usize]
394 + freq[b'a' as usize]
395 + freq[b'o' as usize]
396 + freq[b'n' as usize];
397
398 printable as f32 / total as f32 > 0.8 && common_text as f32 / total as f32 > 0.2
399}
400
401fn analyze_match_uniformity(matches: &[Match]) -> (bool, bool) {
403 if matches.len() < 2 {
404 return (true, true);
405 }
406
407 let first_offset = matches[0].offset;
408 let first_length = matches[0].length;
409
410 let uniform_offsets = matches.iter().all(|m| {
412 let diff = m.offset.abs_diff(first_offset);
413 diff <= 3 });
415
416 let uniform_lengths = matches.iter().all(|m| {
418 let diff = m.length.abs_diff(first_length);
419 diff <= 2 });
421
422 (uniform_offsets, uniform_lengths)
423}
424
425#[cfg(test)]
430mod tests {
431 use super::*;
432
433 #[test]
434 fn test_uniform_detection() {
435 let data = vec![b'A'; 100];
436 let fp = CompressibilityFingerprint::analyze(&data);
437
438 assert_eq!(fp.pattern, PatternType::Uniform);
439 assert_eq!(fp.strategy, CompressionStrategy::RleBlock);
440 assert!(fp.estimated_ratio < 0.1);
441 }
442
443 #[test]
444 fn test_random_detection() {
445 let data: Vec<u8> = (0u64..256)
447 .map(|i| {
448 let x = i.wrapping_mul(1103515245).wrapping_add(12345);
449 (x >> 16) as u8
450 })
451 .collect();
452
453 let fp = CompressibilityFingerprint::analyze(&data);
454 assert!(fp.entropy > 6.0, "Entropy was {}", fp.entropy);
455 }
456
457 #[test]
458 fn test_periodic_detection() {
459 let pattern = b"ABCD";
461 let data: Vec<u8> = pattern.iter().cycle().take(100).copied().collect();
462
463 let fp = CompressibilityFingerprint::analyze(&data);
464
465 if let PatternType::Periodic { period } = fp.pattern {
466 assert_eq!(period, 4);
467 } else {
468 panic!("Expected Periodic pattern, got {:?}", fp.pattern);
469 }
470 }
471
472 #[test]
473 fn test_text_like_detection() {
474 let data = b"The quick brown fox jumps over the lazy dog.";
475 let fp = CompressibilityFingerprint::analyze(data);
476
477 assert_eq!(fp.pattern, PatternType::TextLike);
478 }
479
480 #[test]
481 fn test_low_entropy() {
482 let mut data = Vec::new();
485 data.extend(std::iter::repeat(0u8).take(50));
486 data.extend(std::iter::repeat(1u8).take(30));
487 data.extend(std::iter::repeat(2u8).take(20));
488
489 let fp = CompressibilityFingerprint::analyze(&data);
490
491 assert!(fp.entropy < 2.0, "Entropy was {}", fp.entropy);
492 assert!(matches!(
494 fp.pattern,
495 PatternType::LowEntropy | PatternType::Periodic { .. }
496 ));
497 }
498
499 #[test]
500 fn test_empty_data() {
501 let fp = CompressibilityFingerprint::analyze(&[]);
502 assert_eq!(fp.strategy, CompressionStrategy::RawBlock);
503 }
504
505 #[test]
506 fn test_match_uniformity_analysis() {
507 let uniform_matches = vec![
508 Match::new(10, 5, 4),
509 Match::new(20, 5, 4),
510 Match::new(30, 5, 4),
511 ];
512
513 let (uniform_off, uniform_len) = analyze_match_uniformity(&uniform_matches);
514 assert!(uniform_off);
515 assert!(uniform_len);
516 }
517
518 #[test]
519 fn test_match_non_uniformity() {
520 let varied_matches = vec![
521 Match::new(10, 5, 4),
522 Match::new(20, 100, 20),
523 Match::new(50, 3, 3),
524 ];
525
526 let (uniform_off, uniform_len) = analyze_match_uniformity(&varied_matches);
527 assert!(!uniform_off);
528 assert!(!uniform_len);
529 }
530
531 #[test]
532 fn test_refine_with_no_matches() {
533 let data = b"unique data with no repetition xyz";
534 let mut fp = CompressibilityFingerprint::analyze(data);
535 fp.refine_with_matches(data, &[]);
536
537 assert_eq!(fp.strategy, CompressionStrategy::RawBlock);
538 }
539
540 #[test]
541 fn test_refine_with_good_matches() {
542 let data = b"abcdabcdabcdabcdabcdabcd";
543 let mut fp = CompressibilityFingerprint::analyze(data);
544
545 let matches = vec![
547 Match::new(4, 4, 4),
548 Match::new(8, 4, 4),
549 Match::new(12, 4, 4),
550 Match::new(16, 4, 4),
551 Match::new(20, 4, 4),
552 ];
553
554 fp.refine_with_matches(data, &matches);
555
556 assert_eq!(fp.strategy, CompressionStrategy::RleSequences);
558 }
559
560 #[test]
565 fn test_fast_entropy_estimate_zeros() {
566 let data = vec![0u8; 1000];
567 let entropy = fast_entropy_estimate(&data);
568 assert!(
569 entropy < 0.1,
570 "Zeros should have ~0 entropy, got {}",
571 entropy
572 );
573 }
574
575 #[test]
576 fn test_fast_entropy_estimate_random() {
577 let data: Vec<u8> = (0u64..1000)
579 .map(|i| {
580 let x = i.wrapping_mul(1103515245).wrapping_add(12345);
581 (x % 256) as u8
582 })
583 .collect();
584 let entropy = fast_entropy_estimate(&data);
585 assert!(
586 entropy > 7.0,
587 "Random should have high entropy, got {}",
588 entropy
589 );
590 }
591
592 #[test]
593 fn test_fast_entropy_estimate_text() {
594 let data = b"The quick brown fox jumps over the lazy dog. ";
595 let entropy = fast_entropy_estimate(data);
596 assert!(
598 entropy > 3.5 && entropy < 6.0,
599 "Text should have moderate entropy, got {}",
600 entropy
601 );
602 }
603
604 #[test]
605 fn test_fast_should_compress_compressible() {
606 let zeros = vec![0u8; 1000];
608 assert!(fast_should_compress(&zeros), "Zeros should be compressible");
609
610 let text = b"The quick brown fox jumps over the lazy dog. ";
611 assert!(fast_should_compress(text), "Text should be compressible");
612
613 let repeated = b"abcdefgh".repeat(100);
614 assert!(
615 fast_should_compress(&repeated),
616 "Repeated pattern should be compressible"
617 );
618 }
619
620 #[test]
621 fn test_fast_should_compress_incompressible() {
622 let random: Vec<u8> = (0u64..1000)
625 .map(|i| {
626 let x = i
628 .wrapping_mul(0x5851f42d4c957f2d)
629 .wrapping_add(0x14057b7ef767814f);
630 ((x >> 32) ^ x) as u8
631 })
632 .collect();
633
634 let should = fast_should_compress(&random);
637 println!(
638 "Random data should_compress: {} (entropy: {})",
639 should,
640 fast_entropy_estimate(&random)
641 );
642 }
643
644 #[test]
645 fn test_fast_predict_block_type_rle() {
646 let uniform = vec![b'X'; 1000];
647 assert_eq!(fast_predict_block_type(&uniform), FastBlockType::Rle);
648 }
649
650 #[test]
651 fn test_fast_predict_block_type_compress() {
652 let text = b"The quick brown fox jumps over the lazy dog repeatedly.";
653 assert_eq!(fast_predict_block_type(text), FastBlockType::Compress);
654 }
655
656 #[test]
657 fn test_fast_predict_block_type_raw_for_random() {
658 let data: Vec<u8> = (0..1000)
660 .map(|i| {
661 let x = (i as u64).wrapping_mul(0x5851f42d4c957f2d);
662 ((x >> 32) ^ x) as u8
663 })
664 .collect();
665
666 let block_type = fast_predict_block_type(&data);
667 println!(
669 "High entropy block type: {:?} (entropy: {})",
670 block_type,
671 fast_entropy_estimate(&data)
672 );
673 }
674
675 #[test]
676 fn test_fast_entropy_estimate_speed() {
677 let data = vec![0u8; 100_000];
679
680 let start = std::time::Instant::now();
681 for _ in 0..10_000 {
682 std::hint::black_box(fast_entropy_estimate(&data));
683 }
684 let elapsed = start.elapsed();
685
686 assert!(
688 elapsed.as_millis() < 100,
689 "fast_entropy_estimate too slow: {:?} for 10K iterations",
690 elapsed
691 );
692 }
693}