1mod analysis;
101mod arena;
102pub mod block;
103mod match_finder;
104mod repeat_offset_finder;
105mod sequences;
106pub mod speculative;
107
108pub use analysis::{
109 fast_entropy_estimate,
111 fast_predict_block_type,
112 fast_should_compress,
113 CompressibilityFingerprint,
114 CompressionStrategy,
115 FastBlockType,
116 PatternType,
117};
118pub use arena::{Arena, ArenaVec, DEFAULT_ARENA_SIZE};
119pub use block::{encode_block, BlockEncoder};
120pub use match_finder::{CacheAligned, LazyMatchFinder, Match, MatchFinder};
121pub use repeat_offset_finder::RepeatOffsetMatchFinder;
122pub use sequences::{
123 analyze_for_rle, encode_sequences_fse, encode_sequences_fse_with_encoded, encode_sequences_rle,
124 encode_sequences_with_custom_tables, EncodedSequence, RleSuitability,
125};
126pub use speculative::{SpeculativeCompressor, SpeculativeStrategy};
127
128use crate::frame::{xxhash64, ZSTD_MAGIC};
129use crate::{CustomFseTables, CustomHuffmanTable};
130use haagenti_core::{CompressionLevel, Result};
131
/// Match-finder implementation chosen per compression level (see
/// `CompressContext::new`): `None` gets `Greedy`, `Best`/`Ultra` get
/// `RepeatAware`, every other level gets `Lazy`.
#[derive(Debug)]
enum MatchFinderVariant {
    /// Basic finder; used for `CompressionLevel::None`.
    Greedy(MatchFinder),
    /// Lazy finder; used for `Fast`/`Default`/`Custom` levels.
    Lazy(LazyMatchFinder),
    /// Repeat-offset-aware finder; used for `Best`/`Ultra` levels.
    RepeatAware(RepeatOffsetMatchFinder),
}
142
/// Reusable compression state: level configuration, the level-selected
/// match finder, a scratch arena that is reset per `compress` call, and
/// optional custom entropy tables.
pub struct CompressContext {
    // Retained for Debug output and tests; the level's effect is baked into
    // `match_finder` at construction time.
    #[allow(dead_code)] level: CompressionLevel,
    // Match finder variant selected from `level` in `new`.
    match_finder: MatchFinderVariant,
    // Set via `set_dictionary_id`; not written by this module's frame code.
    dictionary_id: Option<u32>,
    // Scratch allocator, reset at the start of every compression.
    arena: Arena,
    // Caller-supplied FSE tables for the sequences section, if any.
    custom_tables: Option<CustomFseTables>,
    // Caller-supplied Huffman table for the literals section, if any.
    custom_huffman: Option<CustomHuffmanTable>,
}
160
161impl core::fmt::Debug for CompressContext {
163 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
164 f.debug_struct("CompressContext")
165 .field("level", &self.level)
166 .field("match_finder", &self.match_finder)
167 .field("dictionary_id", &self.dictionary_id)
168 .field("arena_usage", &self.arena.usage())
169 .field("arena_capacity", &self.arena.capacity())
170 .field("has_custom_tables", &self.custom_tables.is_some())
171 .field("has_custom_huffman", &self.custom_huffman.is_some())
172 .finish()
173 }
174}
175
176impl CompressContext {
177 pub fn new(level: CompressionLevel) -> Self {
179 let search_depth = match level {
185 CompressionLevel::None => 1,
186 CompressionLevel::Fast => 8, CompressionLevel::Default => 24, CompressionLevel::Best => 48, CompressionLevel::Ultra => 128, CompressionLevel::Custom(n) => n.min(255) as usize,
191 };
192
193 let match_finder = match level {
198 CompressionLevel::None => MatchFinderVariant::Greedy(MatchFinder::new(search_depth)),
199 CompressionLevel::Best | CompressionLevel::Ultra => {
200 MatchFinderVariant::RepeatAware(RepeatOffsetMatchFinder::new(search_depth))
201 }
202 _ => MatchFinderVariant::Lazy(LazyMatchFinder::new(search_depth)),
203 };
204
205 Self {
206 level,
207 match_finder,
208 dictionary_id: None,
209 arena: Arena::with_default_size(),
210 custom_tables: None,
211 custom_huffman: None,
212 }
213 }
214
215 pub fn with_custom_tables(level: CompressionLevel, custom_tables: CustomFseTables) -> Self {
221 let mut ctx = Self::new(level);
222 ctx.custom_tables = Some(custom_tables);
223 ctx
224 }
225
226 pub fn with_options(
231 level: CompressionLevel,
232 custom_tables: Option<CustomFseTables>,
233 custom_huffman: Option<CustomHuffmanTable>,
234 ) -> Self {
235 let mut ctx = Self::new(level);
236 ctx.custom_tables = custom_tables;
237 ctx.custom_huffman = custom_huffman;
238 ctx
239 }
240
241 pub fn with_arena_size(level: CompressionLevel, arena_size: usize) -> Self {
246 let mut ctx = Self::new(level);
247 ctx.arena = Arena::new(arena_size);
248 ctx
249 }
250
251 pub fn custom_tables(&self) -> Option<&CustomFseTables> {
253 self.custom_tables.as_ref()
254 }
255
256 pub fn custom_huffman(&self) -> Option<&CustomHuffmanTable> {
258 self.custom_huffman.as_ref()
259 }
260
261 pub fn arena_peak_usage(&self) -> usize {
263 self.arena.peak_usage()
264 }
265
266 pub fn set_dictionary_id(&mut self, dict_id: u32) {
268 self.dictionary_id = Some(dict_id);
269 }
270
271 fn find_matches(&mut self, input: &[u8]) -> Vec<Match> {
277 match &mut self.match_finder {
278 MatchFinderVariant::Greedy(mf) => mf.find_matches(input),
279 MatchFinderVariant::Lazy(mf) => mf.find_matches_auto(input),
281 MatchFinderVariant::RepeatAware(mf) => mf.find_matches(input),
283 }
284 }
285
286 pub fn compress(&mut self, input: &[u8]) -> Result<Vec<u8>> {
291 self.compress_internal(input, false)
292 }
293
294 #[allow(dead_code)]
298 pub fn compress_with_checksum(&mut self, input: &[u8]) -> Result<Vec<u8>> {
299 self.compress_internal(input, true)
300 }
301
302 #[allow(dead_code)]
318 pub fn compress_speculative(&self, input: &[u8]) -> Result<Vec<u8>> {
319 let compressor = speculative::SpeculativeCompressor::new();
320 compressor.compress(input)
321 }
322
323 #[allow(dead_code)]
327 pub fn compress_speculative_fast(&self, input: &[u8]) -> Result<Vec<u8>> {
328 let compressor = speculative::SpeculativeCompressor::fast();
329 compressor.compress(input)
330 }
331
332 #[allow(dead_code)]
336 pub fn compress_speculative_best(&self, input: &[u8]) -> Result<Vec<u8>> {
337 let compressor = speculative::SpeculativeCompressor::best();
338 compressor.compress(input)
339 }
340
341 fn compress_internal(&mut self, input: &[u8], include_checksum: bool) -> Result<Vec<u8>> {
343 self.arena.reset();
346
347 let mut output = Vec::with_capacity(input.len() + 32);
348
349 output.extend_from_slice(&ZSTD_MAGIC.to_le_bytes());
351
352 self.write_frame_header(input.len(), include_checksum, &mut output)?;
354
355 self.encode_blocks(input, &mut output)?;
357
358 if include_checksum {
360 let checksum = (xxhash64(input, 0) & 0xFFFFFFFF) as u32;
361 output.extend_from_slice(&checksum.to_le_bytes());
362 }
363
364 Ok(output)
365 }
366
367 pub fn arena(&self) -> &Arena {
372 &self.arena
373 }
374
375 fn write_frame_header(
381 &self,
382 content_size: usize,
383 include_checksum: bool,
384 output: &mut Vec<u8>,
385 ) -> Result<()> {
386 let checksum_flag = if include_checksum { 0x04 } else { 0x00 };
389
390 let window_log = if content_size == 0 {
401 19u8 } else {
403 let log = (content_size as f64).log2().ceil() as u8;
404 log.clamp(19, 30) };
406
407 let (fcs_size, descriptor) = if content_size > 131071 {
410 (4, 0x80u8 | checksum_flag)
412 } else {
413 (0, checksum_flag)
416 };
417
418 output.push(descriptor);
419
420 let window_descriptor = (window_log - 10) << 3;
422 output.push(window_descriptor);
423
424 if fcs_size == 4 {
426 output.extend_from_slice(&(content_size as u32).to_le_bytes());
427 }
428
429 Ok(())
430 }
431
432 fn encode_blocks(&mut self, input: &[u8], output: &mut Vec<u8>) -> Result<()> {
434 const MAX_BLOCK_SIZE: usize = 128 * 1024 - 1; if input.is_empty() {
437 let header = 1u32; output.push((header & 0xFF) as u8);
440 output.push(((header >> 8) & 0xFF) as u8);
441 output.push(((header >> 16) & 0xFF) as u8);
442 return Ok(());
443 }
444
445 let mut pos = 0;
446 while pos < input.len() {
447 let remaining = input.len() - pos;
448 let block_size = remaining.min(MAX_BLOCK_SIZE);
449 let is_last = pos + block_size >= input.len();
450
451 let block_data = &input[pos..pos + block_size];
452 self.encode_single_block(block_data, is_last, output)?;
453
454 pos += block_size;
455 }
456
457 Ok(())
458 }
459
460 fn encode_single_block(
462 &mut self,
463 input: &[u8],
464 is_last: bool,
465 output: &mut Vec<u8>,
466 ) -> Result<()> {
467 let (block_type, encoded) = self.select_block_encoding(input)?;
469
470 let mut header = if is_last { 1u32 } else { 0u32 };
472 header |= (block_type as u32) << 1;
473
474 let size = match block_type {
475 0 => input.len(), 1 => input.len(), 2 => encoded.len(), _ => unreachable!(),
479 };
480
481 header |= (size as u32) << 3;
482
483 output.push((header & 0xFF) as u8);
484 output.push(((header >> 8) & 0xFF) as u8);
485 output.push(((header >> 16) & 0xFF) as u8);
486
487 output.extend_from_slice(&encoded);
489
490 Ok(())
491 }
492
493 fn select_block_encoding(&mut self, input: &[u8]) -> Result<(u8, Vec<u8>)> {
500 if input.len() >= 4096 {
502 return self.select_block_encoding_fast(input);
503 }
504
505 let fingerprint = analysis::CompressibilityFingerprint::analyze(input);
507
508 if fingerprint.strategy == CompressionStrategy::RleBlock {
510 return Ok((1, vec![input[0]]));
511 }
512
513 let matches = self.find_matches(input);
520
521 let mut fingerprint = fingerprint;
523 fingerprint.refine_with_matches(input, &matches);
524
525 match fingerprint.strategy {
527 CompressionStrategy::RleBlock => Ok((1, vec![input[0]])),
528 CompressionStrategy::RawBlock => {
529 let compressed = self.encode_literals_only(input)?;
530 if compressed.len() < input.len() {
531 Ok((2, compressed))
532 } else {
533 Ok((0, input.to_vec()))
534 }
535 }
536 CompressionStrategy::RleSequences | CompressionStrategy::PredefinedFse => {
537 let compressed = self.encode_with_rle_sequences(input, &matches)?;
538 if compressed.len() < input.len() {
539 Ok((2, compressed))
540 } else {
541 Ok((0, input.to_vec()))
542 }
543 }
544 }
545 }
546
547 fn select_block_encoding_fast(&mut self, input: &[u8]) -> Result<(u8, Vec<u8>)> {
552 let first = input[0];
554 let is_uniform = input.len() >= 64 && {
555 let first_uniform = input[..64].iter().all(|&b| b == first);
557 if first_uniform {
558 let step = input.len() / 32;
560 (1..32).all(|i| input.get(i * step).copied() == Some(first))
561 } else {
562 false
563 }
564 };
565
566 if is_uniform {
567 return Ok((1, vec![first]));
568 }
569
570 let matches = self.find_matches(input);
577
578 let compressed = self.encode_with_rle_sequences(input, &matches)?;
580 if compressed.len() < input.len() {
581 Ok((2, compressed))
582 } else {
583 Ok((0, input.to_vec()))
584 }
585 }
586
587 fn encode_with_rle_sequences(&mut self, input: &[u8], matches: &[Match]) -> Result<Vec<u8>> {
594 let mut output = Vec::with_capacity(input.len());
596
597 let (literals, seqs) = block::matches_to_sequences(input, matches);
599
600 if seqs.is_empty() {
601 self.encode_literals_with_options(input, &mut output)?;
603 block::encode_sequences(&[], &mut output)?;
604 } else {
605 let suitability = sequences::analyze_for_rle(&seqs);
607
608 self.encode_literals_with_options(&literals, &mut output)?;
610
611 if let Some(custom_tables) = &self.custom_tables {
613 sequences::encode_sequences_with_custom_tables(
614 &suitability.encoded,
615 custom_tables,
616 &mut output,
617 )?;
618 } else {
619 sequences::encode_sequences_fse_with_encoded(&suitability.encoded, &mut output)?;
622 }
623 }
624
625 Ok(output)
626 }
627
628 fn encode_literals_with_options(&self, literals: &[u8], output: &mut Vec<u8>) -> Result<()> {
630 if let Some(custom_huffman) = &self.custom_huffman {
631 block::encode_literals_with_encoder(literals, custom_huffman.encoder(), output)
632 } else {
633 block::encode_literals(literals, output)
634 }
635 }
636
637 fn encode_literals_only(&mut self, input: &[u8]) -> Result<Vec<u8>> {
639 let mut output = Vec::with_capacity(input.len());
640
641 self.encode_literals_with_options(input, &mut output)?;
643
644 block::encode_sequences(&[], &mut output)?;
646
647 Ok(output)
648 }
649}
650
#[cfg(test)]
mod tests {
    use super::*;

    // Context construction stores the requested level.
    #[test]
    fn test_compress_context_creation() {
        let ctx = CompressContext::new(CompressionLevel::Default);
        assert_eq!(ctx.level, CompressionLevel::Default);
    }

    // Empty input still yields a valid frame: magic + header + empty block.
    #[test]
    fn test_compress_empty() {
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let result = ctx.compress(&[]).unwrap();

        // Magic (4) + frame header (2) + empty last-block header (3).
        assert!(
            result.len() >= 4 + 2 + 3,
            "expected at least 9 bytes, got {}",
            result.len()
        );

        // ZSTD magic number in little-endian byte order.
        assert_eq!(&result[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
    }

    // Small inputs compress without error and carry the magic number.
    #[test]
    fn test_compress_small() {
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let input = b"Hello, World!";
        let result = ctx.compress(input).unwrap();

        assert_eq!(&result[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
    }

    // A run of identical bytes must shrink (RLE or equivalent).
    #[test]
    fn test_compress_rle_detection() {
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let input = vec![b'A'; 100];
        let result = ctx.compress(&input).unwrap();

        assert!(result.len() < input.len());
    }

    // Every standard level should compress without error.
    #[test]
    fn test_compression_levels() {
        for level in [
            CompressionLevel::Fast,
            CompressionLevel::Default,
            CompressionLevel::Best,
        ] {
            let mut ctx = CompressContext::new(level);
            let input = b"Test compression with different levels";
            let result = ctx.compress(input);
            assert!(result.is_ok());
        }
    }

    // Regression floor on the compression ratio for repetitive text.
    #[test]
    fn test_text_compression_ratio_baseline() {
        let data = b"The quick brown fox jumps over the lazy dog. ".repeat(1000);

        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();

        let ratio = data.len() as f64 / compressed.len() as f64;

        assert!(
            ratio >= 2.5,
            "Text compression ratio {:.2}x below baseline 2.5x",
            ratio
        );
    }

    // Highly repetitive data should reach a much higher ratio floor.
    #[test]
    fn test_repetitive_data_compression_ratio() {
        let data = b"abcdabcdabcdabcd".repeat(5000);

        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();

        let ratio = data.len() as f64 / compressed.len() as f64;

        assert!(
            ratio >= 50.0,
            "Repetitive data ratio {:.2}x below expected 50x",
            ratio
        );
    }

    // Higher levels should never produce larger output than lower ones
    // (Default vs Fast allows a 50-byte tolerance).
    #[test]
    fn test_compression_levels_ratio_ordering() {
        let data = b"This is test data that will be compressed at different levels. ".repeat(500);

        let mut fast_ctx = CompressContext::new(CompressionLevel::Fast);
        let mut default_ctx = CompressContext::new(CompressionLevel::Default);
        let mut best_ctx = CompressContext::new(CompressionLevel::Best);

        let fast = fast_ctx.compress(&data).unwrap();
        let default = default_ctx.compress(&data).unwrap();
        let best = best_ctx.compress(&data).unwrap();

        assert!(
            best.len() <= default.len(),
            "Best ({}) should be <= Default ({})",
            best.len(),
            default.len()
        );
        assert!(
            default.len() <= fast.len() + 50,
            "Default ({}) should be close to or better than Fast ({})",
            default.len(),
            fast.len()
        );
    }

    // 100x more data should take well under 200x the time (rough linearity).
    #[test]
    fn test_adaptive_search_depth_performance() {
        let small_data = vec![b'a'; 1000];
        let large_data = vec![b'a'; 100_000];

        let mut ctx_small = CompressContext::new(CompressionLevel::Fast);
        let mut ctx_large = CompressContext::new(CompressionLevel::Fast);

        let start = std::time::Instant::now();
        let _ = ctx_small.compress(&small_data).unwrap();
        let small_time = start.elapsed();

        let start = std::time::Instant::now();
        let _ = ctx_large.compress(&large_data).unwrap();
        let large_time = start.elapsed();

        let time_ratio = large_time.as_nanos() as f64 / small_time.as_nanos().max(1) as f64;
        assert!(
            time_ratio < 200.0,
            "Time ratio {:.1}x exceeds expected linear scaling",
            time_ratio
        );
    }

    // The lazy finder must return valid matches (offset > 0, length >= 3)
    // on obviously repetitive data.
    #[test]
    fn test_lazy_match_finder_produces_matches() {
        let data = b"abcdefabcdefabcdef".repeat(100);

        let mut lazy_mf = LazyMatchFinder::new(24);
        let matches = lazy_mf.find_matches(&data);

        assert!(
            !matches.is_empty(),
            "Lazy match finder should find matches in repetitive data"
        );

        for m in &matches {
            assert!(m.offset > 0, "Match offset should be positive");
            assert!(m.length >= 3, "Match length should be at least 3");
        }
    }

    // A pattern repeated 10 KB apart should still be matched.
    #[test]
    fn test_long_distance_pattern_detection() {
        let pattern = b"This is a distinctive pattern.";
        let mut data = Vec::new();
        data.extend_from_slice(pattern);
        data.extend_from_slice(&vec![b'x'; 10_000]);
        data.extend_from_slice(pattern);
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();

        assert!(
            compressed.len() < data.len() - 20,
            "Should find long-distance pattern match"
        );
    }

    // All-zero input is minimal-entropy and should compress >1000x.
    #[test]
    fn test_entropy_based_strategy_selection() {
        let low_entropy = vec![0u8; 100_000];

        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let compressed = ctx.compress(&low_entropy).unwrap();

        let ratio = low_entropy.len() as f64 / compressed.len() as f64;
        assert!(
            ratio > 1000.0,
            "Low entropy ratio {:.0}x should be >1000x",
            ratio
        );
    }

    // Compressible text around an incompressible middle should still shrink.
    #[test]
    fn test_mixed_content_compression() {
        let mut data = Vec::new();
        data.extend_from_slice(b"Compressible repeated text. ".repeat(50).as_slice());
        data.extend_from_slice(&(0u8..=255u8).cycle().take(1000).collect::<Vec<u8>>());
        data.extend_from_slice(b"More compressible text here. ".repeat(50).as_slice());

        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let compressed = ctx.compress(&data).unwrap();

        assert!(
            compressed.len() < data.len(),
            "Mixed content should still compress"
        );
    }

    // The speculative path works and actually shrinks the input.
    #[test]
    fn test_speculative_compression_available() {
        let data = b"Test data for speculative compression. ".repeat(100);

        let ctx = CompressContext::new(CompressionLevel::Default);
        let result = ctx.compress_speculative(&data);

        assert!(result.is_ok(), "Speculative compression should work");
        assert!(
            result.unwrap().len() < data.len(),
            "Should produce compression"
        );
    }

    // The checksum variant adds exactly 4 trailing bytes (the flag lives in
    // an already-written descriptor byte, so no other size change).
    #[test]
    fn test_compression_with_checksum() {
        let data = b"Data to compress with checksum verification.".repeat(100);

        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let with_checksum = ctx.compress_with_checksum(&data).unwrap();
        let without_checksum = ctx.compress(&data).unwrap();

        assert_eq!(
            with_checksum.len(),
            without_checksum.len() + 4,
            "Checksum adds 4 bytes"
        );
    }

    // Repeated compressions reuse the same context/arena without error.
    #[test]
    fn test_arena_reuse_efficiency() {
        let mut ctx = CompressContext::new(CompressionLevel::Default);

        for i in 0..5 {
            let data = format!("Test arena reuse iteration {}. ", i).repeat(1000);
            let result = ctx.compress(data.as_bytes()).unwrap();
            assert!(!result.is_empty());
        }

        let capacity = ctx.arena().capacity();
        assert!(capacity > 0, "Arena should have capacity");
    }

    // 150 KB input exceeds the ~128 KB block limit, forcing multiple blocks.
    #[test]
    fn test_block_size_handling() {
        let large_data = vec![b'A'; 150_000];
        let mut ctx = CompressContext::new(CompressionLevel::Fast);
        let compressed = ctx.compress(&large_data).unwrap();

        assert!(!compressed.is_empty());
        assert!(
            compressed.len() < large_data.len(),
            "Large RLE data should compress well"
        );
    }

    // None (greedy) and Best (repeat-aware) both succeed; Best is never worse.
    #[test]
    fn test_match_finder_variant_selection() {
        let data = b"test pattern for matching variants".repeat(100);

        let mut none_ctx = CompressContext::new(CompressionLevel::None);
        let none_result = none_ctx.compress(&data);
        assert!(none_result.is_ok());

        let mut best_ctx = CompressContext::new(CompressionLevel::Best);
        let best_result = best_ctx.compress(&data);
        assert!(best_result.is_ok());

        assert!(
            best_result.unwrap().len() <= none_result.unwrap().len(),
            "Best should be at least as good as None"
        );
    }

    // The repeat-offset finder detects matches in periodic data.
    #[test]
    fn test_repeat_offset_match_finder() {
        let data = b"abcdefghijklmnopabcdefghijklmnopabcdefghijklmnop".repeat(50);

        let mut mf = RepeatOffsetMatchFinder::new(48);
        let matches = mf.find_matches(&data);

        assert!(
            !matches.is_empty(),
            "RepeatAware should find matches in repetitive data"
        );
    }

    // Fingerprint analysis classifies uniform and text-like data sensibly.
    #[test]
    fn test_compression_fingerprint_accuracy() {
        let uniform_data = vec![42u8; 1000];
        let fp_uniform = analysis::CompressibilityFingerprint::analyze(&uniform_data);
        assert_eq!(
            fp_uniform.pattern,
            PatternType::Uniform,
            "Should detect uniform pattern"
        );

        let text_data = b"The quick brown fox. ".repeat(100);
        let fp_text = analysis::CompressibilityFingerprint::analyze(&text_data);
        assert!(
            matches!(
                fp_text.pattern,
                PatternType::TextLike | PatternType::Periodic { .. } | PatternType::LowEntropy
            ),
            "Should detect text-like or periodic pattern, got {:?}",
            fp_text.pattern
        );
    }

    // Entropy estimator separates constant data from near-uniform bytes.
    #[test]
    fn test_fast_entropy_estimate() {
        let low_entropy = vec![0u8; 10000];
        let est_low = fast_entropy_estimate(&low_entropy);
        assert!(est_low < 1.0, "Low entropy data should have low estimate");

        let high_entropy: Vec<u8> = (0..10000).map(|i| (i % 256) as u8).collect();
        let est_high = fast_entropy_estimate(&high_entropy);
        assert!(
            est_high > 5.0,
            "High entropy data should have high estimate: {}",
            est_high
        );
    }

    // Block-type predictor: uniform data -> RLE; text -> Compress or Raw.
    #[test]
    fn test_fast_predict_block_type() {
        let zeros = vec![0u8; 1000];
        assert_eq!(
            fast_predict_block_type(&zeros),
            FastBlockType::Rle,
            "Uniform data should predict RLE"
        );

        let text = b"Hello world! ".repeat(100);
        let predicted = fast_predict_block_type(&text);
        assert!(
            predicted == FastBlockType::Compress || predicted == FastBlockType::Raw,
            "Text should predict Compress or Raw"
        );
    }
}