1#![cfg_attr(test, allow(unused_imports))]
3#![cfg_attr(test, allow(unused_variables))]
4#![cfg_attr(test, allow(unused_mut))]
5#![cfg_attr(test, allow(clippy::int_plus_one))]
6#![cfg_attr(test, allow(clippy::precedence))]
7#![cfg_attr(test, allow(clippy::unnecessary_unwrap))]
8#![cfg_attr(test, allow(clippy::slow_vector_initialization))]
9#![cfg_attr(test, allow(clippy::manual_repeat_n))]
10#![cfg_attr(test, allow(clippy::len_zero))]
11#![cfg_attr(test, allow(clippy::manual_range_contains))]
12#![cfg_attr(test, allow(clippy::identity_op))]
13#![cfg_attr(test, allow(clippy::needless_range_loop))]
14#![cfg_attr(test, allow(clippy::assertions_on_constants))]
15#![cfg_attr(test, allow(clippy::same_item_push))]
16#![cfg_attr(test, allow(clippy::if_same_then_else))]
17#![cfg_attr(test, allow(clippy::expect_fun_call))]
18#![cfg_attr(test, allow(clippy::redundant_slicing))]
19#![cfg_attr(test, allow(clippy::collapsible_else_if))]
20#![cfg_attr(test, allow(clippy::redundant_closure))]
21#![cfg_attr(test, allow(clippy::manual_div_ceil))]
22#![cfg_attr(test, allow(clippy::useless_vec))]
23
24pub mod block;
155pub mod compress;
156pub mod decompress;
157pub mod dictionary;
158pub mod frame;
159pub mod fse;
160pub mod huffman;
161
162#[cfg(test)]
163mod perf_tests;
164
165pub use dictionary::{ZstdDictCompressor, ZstdDictDecompressor, ZstdDictionary};
166
167use haagenti_core::{
168 Algorithm, Codec, CompressionLevel, CompressionStats, Compressor, Decompressor, Error, Result,
169};
170
/// Zstandard frame magic number; appears little-endian on the wire as `28 B5 2F FD`.
pub const ZSTD_MAGIC: u32 = 0xFD2FB528;

/// Largest window size this implementation accepts (128 MiB).
pub const MAX_WINDOW_SIZE: usize = 1 << 27;

/// Smallest window size this implementation accepts (1 KiB).
pub const MIN_WINDOW_SIZE: usize = 1 << 10;
183
184use fse::FseTable;
189use huffman::HuffmanEncoder;
190use std::sync::Arc;
191
192#[derive(Debug, Clone)]
212pub struct CustomHuffmanTable {
213 encoder: Arc<HuffmanEncoder>,
215}
216
217impl CustomHuffmanTable {
218 pub fn new(encoder: HuffmanEncoder) -> Self {
220 Self {
221 encoder: Arc::new(encoder),
222 }
223 }
224
225 pub fn encoder(&self) -> &HuffmanEncoder {
227 &self.encoder
228 }
229}
230
231#[derive(Debug, Clone, Default)]
254pub struct CustomFseTables {
255 pub ll_table: Option<Arc<FseTable>>,
257 pub of_table: Option<Arc<FseTable>>,
259 pub ml_table: Option<Arc<FseTable>>,
261}
262
263impl CustomFseTables {
264 pub fn new() -> Self {
266 Self::default()
267 }
268
269 pub fn with_ll_table(mut self, table: FseTable) -> Self {
271 self.ll_table = Some(Arc::new(table));
272 self
273 }
274
275 pub fn with_of_table(mut self, table: FseTable) -> Self {
277 self.of_table = Some(Arc::new(table));
278 self
279 }
280
281 pub fn with_ml_table(mut self, table: FseTable) -> Self {
283 self.ml_table = Some(Arc::new(table));
284 self
285 }
286
287 pub fn has_custom_tables(&self) -> bool {
289 self.ll_table.is_some() || self.of_table.is_some() || self.ml_table.is_some()
290 }
291}
292
293#[derive(Debug, Clone)]
317pub struct ZstdCompressor {
318 level: CompressionLevel,
319 custom_tables: Option<CustomFseTables>,
321 custom_huffman: Option<CustomHuffmanTable>,
323}
324
325impl ZstdCompressor {
326 pub fn new() -> Self {
328 Self {
329 level: CompressionLevel::Default,
330 custom_tables: None,
331 custom_huffman: None,
332 }
333 }
334
335 pub fn with_level(level: CompressionLevel) -> Self {
337 Self {
338 level,
339 custom_tables: None,
340 custom_huffman: None,
341 }
342 }
343
344 pub fn with_custom_tables(custom_tables: CustomFseTables) -> Self {
356 Self {
357 level: CompressionLevel::Default,
358 custom_tables: Some(custom_tables),
359 custom_huffman: None,
360 }
361 }
362
363 pub fn with_custom_huffman(custom_huffman: CustomHuffmanTable) -> Self {
368 Self {
369 level: CompressionLevel::Default,
370 custom_tables: None,
371 custom_huffman: Some(custom_huffman),
372 }
373 }
374
375 pub fn with_level_and_tables(level: CompressionLevel, custom_tables: CustomFseTables) -> Self {
377 Self {
378 level,
379 custom_tables: Some(custom_tables),
380 custom_huffman: None,
381 }
382 }
383
384 pub fn with_all_options(
386 level: CompressionLevel,
387 custom_tables: Option<CustomFseTables>,
388 custom_huffman: Option<CustomHuffmanTable>,
389 ) -> Self {
390 Self {
391 level,
392 custom_tables,
393 custom_huffman,
394 }
395 }
396
397 pub fn custom_tables(&self) -> Option<&CustomFseTables> {
399 self.custom_tables.as_ref()
400 }
401
402 pub fn custom_huffman(&self) -> Option<&CustomHuffmanTable> {
404 self.custom_huffman.as_ref()
405 }
406}
407
408impl Default for ZstdCompressor {
409 fn default() -> Self {
410 Self::new()
411 }
412}
413
414impl Compressor for ZstdCompressor {
415 fn algorithm(&self) -> Algorithm {
416 Algorithm::Zstd
417 }
418
419 fn level(&self) -> CompressionLevel {
420 self.level
421 }
422
423 fn compress(&self, input: &[u8]) -> Result<Vec<u8>> {
424 let mut ctx = compress::CompressContext::with_options(
425 self.level,
426 self.custom_tables.clone(),
427 self.custom_huffman.clone(),
428 );
429 ctx.compress(input)
430 }
431
432 fn compress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
433 let compressed = self.compress(input)?;
434 if compressed.len() > output.len() {
435 return Err(Error::buffer_too_small(output.len(), compressed.len()));
436 }
437 output[..compressed.len()].copy_from_slice(&compressed);
438 Ok(compressed.len())
439 }
440
441 fn max_compressed_size(&self, input_len: usize) -> usize {
442 input_len + (input_len >> 7) + 512
444 }
445
446 fn stats(&self) -> Option<CompressionStats> {
447 None
448 }
449}
450
/// Stateless Zstandard decompressor.
#[derive(Debug, Clone, Default)]
pub struct ZstdDecompressor;

impl ZstdDecompressor {
    /// Creates a decompressor; the type carries no configuration.
    pub fn new() -> Self {
        Self::default()
    }
}
463
464impl Decompressor for ZstdDecompressor {
465 fn algorithm(&self) -> Algorithm {
466 Algorithm::Zstd
467 }
468
469 fn decompress(&self, input: &[u8]) -> Result<Vec<u8>> {
470 decompress::decompress_frame(input)
471 }
472
473 fn decompress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
474 let result = self.decompress(input)?;
475 if result.len() > output.len() {
476 return Err(Error::buffer_too_small(output.len(), result.len()));
477 }
478 output[..result.len()].copy_from_slice(&result);
479 Ok(result.len())
480 }
481
482 fn stats(&self) -> Option<CompressionStats> {
483 None
484 }
485}
486
487#[derive(Debug, Clone)]
489pub struct ZstdCodec {
490 level: CompressionLevel,
491}
492
493impl ZstdCodec {
494 pub fn new() -> Self {
496 Self {
497 level: CompressionLevel::Default,
498 }
499 }
500
501 pub fn with_level(level: CompressionLevel) -> Self {
503 Self { level }
504 }
505}
506
507impl Default for ZstdCodec {
508 fn default() -> Self {
509 Self::new()
510 }
511}
512
513impl Compressor for ZstdCodec {
514 fn algorithm(&self) -> Algorithm {
515 Algorithm::Zstd
516 }
517
518 fn level(&self) -> CompressionLevel {
519 self.level
520 }
521
522 fn compress(&self, input: &[u8]) -> Result<Vec<u8>> {
523 ZstdCompressor::with_level(self.level).compress(input)
524 }
525
526 fn compress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
527 ZstdCompressor::with_level(self.level).compress_to(input, output)
528 }
529
530 fn max_compressed_size(&self, input_len: usize) -> usize {
531 ZstdCompressor::new().max_compressed_size(input_len)
532 }
533
534 fn stats(&self) -> Option<CompressionStats> {
535 None
536 }
537}
538
539impl Decompressor for ZstdCodec {
540 fn algorithm(&self) -> Algorithm {
541 Algorithm::Zstd
542 }
543
544 fn decompress(&self, input: &[u8]) -> Result<Vec<u8>> {
545 ZstdDecompressor::new().decompress(input)
546 }
547
548 fn decompress_to(&self, input: &[u8], output: &mut [u8]) -> Result<usize> {
549 ZstdDecompressor::new().decompress_to(input, output)
550 }
551
552 fn stats(&self) -> Option<CompressionStats> {
553 None
554 }
555}
556
557impl Codec for ZstdCodec {
558 fn new() -> Self {
559 ZstdCodec::new()
560 }
561
562 fn with_level(level: CompressionLevel) -> Self {
563 ZstdCodec::with_level(level)
564 }
565}
566
567#[cfg(test)]
572mod tests {
573 use super::*;
574
575 #[test]
576 fn test_magic_number() {
577 assert_eq!(ZSTD_MAGIC, 0xFD2FB528);
578 }
579
    #[test]
    fn test_decompressor_validates_magic() {
        let decompressor = ZstdDecompressor::new();

        // A frame with the wrong magic number must be rejected outright.
        let invalid_data = [0x00, 0x00, 0x00, 0x00, 0x00];
        let result = decompressor.decompress(&invalid_data);
        assert!(result.is_err());

        // Correct magic (little-endian 28 B5 2F FD) but a truncated frame
        // with no block data must also error.
        let valid_magic = [0x28, 0xB5, 0x2F, 0xFD, 0x00];
        let result = decompressor.decompress(&valid_magic);
        assert!(result.is_err());
    }
594
595 #[test]
596 fn test_too_short_input() {
597 let decompressor = ZstdDecompressor::new();
598 let result = decompressor.decompress(&[0x28, 0xB5]);
599 assert!(result.is_err());
600 }
601
602 #[test]
603 fn test_compressor_works() {
604 let compressor = ZstdCompressor::new();
605 let result = compressor.compress(b"test");
606 assert!(result.is_ok());
607
608 let compressed = result.unwrap();
610 assert_eq!(&compressed[0..4], &[0x28, 0xB5, 0x2F, 0xFD]);
611 }
612
613 #[test]
614 fn test_max_compressed_size() {
615 let compressor = ZstdCompressor::new();
616
617 assert!(compressor.max_compressed_size(100) > 100);
619
620 let large_max = compressor.max_compressed_size(1_000_000);
622 assert!(large_max > 1_000_000);
623 assert!(large_max < 1_100_000); }
625
626 #[test]
627 fn test_codec_algorithm() {
628 let codec = ZstdCodec::new();
629 assert_eq!(Compressor::algorithm(&codec), Algorithm::Zstd);
630 assert_eq!(Decompressor::algorithm(&codec), Algorithm::Zstd);
631 }
632
633 #[test]
634 fn test_compression_levels() {
635 for level in [
636 CompressionLevel::Fast,
637 CompressionLevel::Default,
638 CompressionLevel::Best,
639 ] {
640 let compressor = ZstdCompressor::with_level(level);
641 assert_eq!(compressor.level(), level);
642 }
643 }
644
    #[test]
    fn test_decompressor_raw_block() {
        // Hand-build a minimal single-block frame.
        let mut frame = vec![];

        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);

        // Frame header descriptor: single-segment, 1-byte frame content size.
        frame.push(0x20);

        // Frame content size: 5 bytes.
        frame.push(5);

        // 3-byte block header 0x29 = (5 << 3) | (raw=0 << 1) | last=1.
        frame.extend_from_slice(&[0x29, 0x00, 0x00]);

        // Raw block payload, stored verbatim.
        frame.extend_from_slice(b"Hello");

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result, b"Hello");
    }
669
    #[test]
    fn test_decompressor_rle_block() {
        let mut frame = vec![];

        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);

        // Descriptor: single-segment, 1-byte frame content size.
        frame.push(0x20);

        // Frame content size: 10 bytes.
        frame.push(10);

        // Block header 0x53 = (10 << 3) | (rle=1 << 1) | last=1.
        frame.extend_from_slice(&[0x53, 0x00, 0x00]);

        // RLE payload: the single byte to repeat.
        frame.push(b'X');

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result, vec![b'X'; 10]);
    }
693
    #[test]
    fn test_decompressor_multi_block() {
        let mut frame = vec![];

        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);

        // Descriptor: single-segment, 1-byte frame content size.
        frame.push(0x20);

        // Total content size across both blocks: 8 bytes.
        frame.push(8);

        // Block 1 header 0x28 = (5 << 3) | raw | not-last; payload "Hello".
        frame.extend_from_slice(&[0x28, 0x00, 0x00]);
        frame.extend_from_slice(b"Hello");

        // Block 2 header 0x19 = (3 << 3) | raw | last; payload "!!!".
        frame.extend_from_slice(&[0x19, 0x00, 0x00]);
        frame.extend_from_slice(b"!!!");

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result, b"Hello!!!");
    }
719
    #[test]
    fn test_decompressor_with_checksum() {
        use crate::frame::xxhash64;

        let mut frame = vec![];

        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);

        // Descriptor: single-segment (0x20) + content-checksum flag (0x04).
        frame.push(0x24);

        // Frame content size: 5 bytes.
        frame.push(5);

        // Raw last block of 5 bytes: 0x29 = (5 << 3) | (0 << 1) | 1.
        frame.extend_from_slice(&[0x29, 0x00, 0x00]);
        frame.extend_from_slice(b"Hello");

        // Trailing checksum: low 32 bits of xxhash64 of the decoded
        // content, appended little-endian.
        let hash = xxhash64(b"Hello", 0);
        let checksum = (hash & 0xFFFFFFFF) as u32;
        frame.extend_from_slice(&checksum.to_le_bytes());

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result, b"Hello");
    }
748
749 #[test]
750 fn test_decompress_to() {
751 let mut frame = vec![];
752 frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
753 frame.push(0x20);
754 frame.push(5);
755 frame.extend_from_slice(&[0x29, 0x00, 0x00]);
756 frame.extend_from_slice(b"Hello");
757
758 let decompressor = ZstdDecompressor::new();
759 let mut output = vec![0u8; 10];
760 let len = decompressor.decompress_to(&frame, &mut output).unwrap();
761
762 assert_eq!(len, 5);
763 assert_eq!(&output[..5], b"Hello");
764 }
765
    #[test]
    fn test_decompress_to_buffer_too_small() {
        // Frame that decodes to 5 bytes ("Hello").
        let mut frame = vec![];
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
        frame.push(0x20);
        frame.push(5);
        frame.extend_from_slice(&[0x29, 0x00, 0x00]);
        frame.extend_from_slice(b"Hello");

        let decompressor = ZstdDecompressor::new();
        // Output buffer (2 bytes) is smaller than the decoded payload,
        // so decompress_to must report an error instead of truncating.
        let mut output = vec![0u8; 2];
        let result = decompressor.decompress_to(&frame, &mut output);
        assert!(result.is_err());
    }
780
    /// Builds a zstd frame by hand for the integration tests.
    ///
    /// * `content_size` — optional frame content size; its magnitude picks
    ///   the FCS field width (1/2/4/8 bytes) and sets the matching
    ///   descriptor flag bits.
    /// * `has_checksum` — when true, appends the low 32 bits of xxhash64
    ///   over the decoded content.
    /// * `blocks` — `(is_last, block_type, data)` triples; type 0 = raw,
    ///   type 1 = RLE (data[0] repeated data.len() times).
    fn build_frame(
        content_size: Option<u64>,
        has_checksum: bool,
        blocks: Vec<(bool, u8, Vec<u8>)>,
    ) -> Vec<u8> {
        let mut frame = vec![];

        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);

        let mut descriptor = 0u8;
        if has_checksum {
            // Content-checksum flag.
            descriptor |= 0x04;
        }

        // Pick the FCS field width and set the descriptor's FCS/single-segment bits.
        let fcs_bytes = match content_size {
            None => 0,
            Some(s) if s <= 255 => {
                // Single-segment flag doubles as the 1-byte FCS marker.
                descriptor |= 0x20;
                1
            }
            Some(s) if s <= 65791 => {
                // 2-byte field covers 256..=65791 (stored as size - 256).
                descriptor |= 0x40;
                2
            }
            Some(s) if s <= 0xFFFFFFFF => {
                descriptor |= 0x80;
                4
            }
            Some(_) => {
                descriptor |= 0xC0;
                8
            }
        };

        frame.push(descriptor);

        // Window descriptor is required when single-segment (0x20) is unset.
        // NOTE(review): when content_size is None this branch also skips the
        // window byte — no current caller passes None; confirm if one is added.
        if descriptor & 0x20 == 0 && content_size.is_some() {
            frame.push(0x00);
        }

        if let Some(size) = content_size {
            match fcs_bytes {
                1 => frame.push(size as u8),
                2 => {
                    // 2-byte FCS stores size - 256 per the zstd format.
                    let adjusted = size.saturating_sub(256) as u16;
                    frame.extend_from_slice(&adjusted.to_le_bytes());
                }
                4 => frame.extend_from_slice(&(size as u32).to_le_bytes()),
                8 => frame.extend_from_slice(&size.to_le_bytes()),
                _ => {}
            }
        }

        // Track what the frame should decode to, for the checksum trailer.
        let mut decompressed_content = Vec::new();
        for (is_last, block_type, data) in blocks {
            let _compressed_size = if block_type == 1 { 1 } else { data.len() };
            let decompressed_size = if block_type == 1 {
                data.len()
            } else {
                data.len()
            };

            // 3-byte block header: last(1) | type(2) | decompressed size(21),
            // written little-endian.
            let mut header = if is_last { 1u32 } else { 0u32 };
            header |= (block_type as u32) << 1;
            header |= (decompressed_size as u32) << 3;

            frame.push((header & 0xFF) as u8);
            frame.push(((header >> 8) & 0xFF) as u8);
            frame.push(((header >> 16) & 0xFF) as u8);

            if block_type == 1 {
                // RLE block: one byte on the wire, repeated on decode.
                frame.push(data[0]);
                for _ in 0..decompressed_size {
                    decompressed_content.push(data[0]);
                }
            } else {
                // Raw block: payload stored verbatim.
                frame.extend_from_slice(&data);
                decompressed_content.extend_from_slice(&data);
            }
        }

        if has_checksum {
            // Low 32 bits of xxhash64 over the decoded content, little-endian.
            let hash = crate::frame::xxhash64(&decompressed_content, 0);
            let checksum = (hash & 0xFFFFFFFF) as u32;
            frame.extend_from_slice(&checksum.to_le_bytes());
        }

        frame
    }
885
886 #[test]
887 fn test_integration_empty_frame() {
888 let frame = build_frame(
890 Some(0),
891 false,
892 vec![
893 (true, 0, vec![]), ],
895 );
896
897 let decompressor = ZstdDecompressor::new();
898 let result = decompressor.decompress(&frame).unwrap();
899 assert!(result.is_empty());
900 }
901
902 #[test]
903 fn test_integration_multiple_raw_blocks() {
904 let frame = build_frame(
906 Some(15),
907 true,
908 vec![
909 (false, 0, b"Hello".to_vec()),
910 (false, 0, b", ".to_vec()),
911 (true, 0, b"World!!!".to_vec()),
912 ],
913 );
914
915 let decompressor = ZstdDecompressor::new();
916 let result = decompressor.decompress(&frame).unwrap();
917 assert_eq!(result, b"Hello, World!!!");
918 }
919
    #[test]
    fn test_integration_mixed_raw_rle() {
        let mut frame = vec![];

        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
        // Descriptor: single-segment (0x20) + content checksum (0x04).
        frame.push(0x24);
        // Frame content size: 11 bytes ("Start---End").
        frame.push(11);

        // Block 1: raw, not last, 5 bytes "Start".
        let header1 = (5 << 3) | (0 << 1) | 0;
        frame.push((header1 & 0xFF) as u8);
        frame.push(((header1 >> 8) & 0xFF) as u8);
        frame.push(((header1 >> 16) & 0xFF) as u8);
        frame.extend_from_slice(b"Start");

        // Block 2: RLE, not last, '-' repeated 3 times.
        let header2 = (3 << 3) | (1 << 1) | 0;
        frame.push((header2 & 0xFF) as u8);
        frame.push(((header2 >> 8) & 0xFF) as u8);
        frame.push(((header2 >> 16) & 0xFF) as u8);
        frame.push(b'-');

        // Block 3: raw, last, 3 bytes "End".
        let header3 = (3 << 3) | (0 << 1) | 1;
        frame.push((header3 & 0xFF) as u8);
        frame.push(((header3 >> 8) & 0xFF) as u8);
        frame.push(((header3 >> 16) & 0xFF) as u8);
        frame.extend_from_slice(b"End");

        // Trailing checksum: low 32 bits of xxhash64 over the full content.
        let content = b"Start---End";
        let hash = crate::frame::xxhash64(content, 0);
        let checksum = (hash & 0xFFFFFFFF) as u32;
        frame.extend_from_slice(&checksum.to_le_bytes());

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result, b"Start---End");
    }
960
    #[test]
    fn test_integration_large_rle() {
        let mut frame = vec![];
        // Magic number (little-endian).
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
        // Descriptor: single-segment, 1-byte frame content size.
        frame.push(0x20);
        // Frame content size: 200 bytes.
        frame.push(200);

        // Single RLE block, last: 200 copies of one byte.
        let header = (200 << 3) | (1 << 1) | 1;
        frame.push((header & 0xFF) as u8);
        frame.push(((header >> 8) & 0xFF) as u8);
        frame.push(((header >> 16) & 0xFF) as u8);
        frame.push(b'X');

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result.len(), 200);
        assert!(result.iter().all(|&b| b == b'X'));
    }
981
    #[test]
    fn test_integration_two_byte_fcs() {
        // 300 bytes forces the 2-byte frame-content-size encoding.
        let size = 300usize;
        let data: Vec<u8> = (0..size).map(|i| (i % 256) as u8).collect();

        let mut frame = vec![];
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);

        // Descriptor: 2-byte FCS field, single-segment flag unset.
        frame.push(0x40);

        // Window descriptor (required when single-segment is unset).
        frame.push(0x00);

        // The 2-byte FCS field stores size - 256, little-endian.
        let fcs_value = (size - 256) as u16;
        frame.extend_from_slice(&fcs_value.to_le_bytes());

        // Single raw block, last, carrying all `size` bytes.
        let header = (size << 3) | 1;
        frame.push((header & 0xFF) as u8);
        frame.push(((header >> 8) & 0xFF) as u8);
        frame.push(((header >> 16) & 0xFF) as u8);
        frame.extend_from_slice(&data);

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result.len(), size);
        assert_eq!(result, data);
    }
1013
1014 #[test]
1015 fn test_integration_binary_data() {
1016 let data: Vec<u8> = (0..=255).collect();
1018
1019 let mut frame = vec![];
1020 frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
1021
1022 frame.push(0x40);
1024 frame.push(0x00); frame.extend_from_slice(&0u16.to_le_bytes());
1028
1029 let header = (256 << 3) | 1;
1031 frame.push((header & 0xFF) as u8);
1032 frame.push(((header >> 8) & 0xFF) as u8);
1033 frame.push(((header >> 16) & 0xFF) as u8);
1034 frame.extend_from_slice(&data);
1035
1036 let decompressor = ZstdDecompressor::new();
1037 let result = decompressor.decompress(&frame).unwrap();
1038 assert_eq!(result, data);
1039 }
1040
    #[test]
    fn test_integration_checksum_verification() {
        let data = b"Test data for checksum verification!";

        let mut frame = vec![];
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
        // Descriptor: single-segment (0x20) + content checksum (0x04).
        frame.push(0x24);
        frame.push(data.len() as u8);

        // Single raw block, last.
        let header = (data.len() << 3) | 1;
        frame.push((header & 0xFF) as u8);
        frame.push(((header >> 8) & 0xFF) as u8);
        frame.push(((header >> 16) & 0xFF) as u8);
        frame.extend_from_slice(data);

        // Correct checksum: low 32 bits of xxhash64, little-endian.
        let hash = crate::frame::xxhash64(data, 0);
        let checksum = (hash & 0xFFFFFFFF) as u32;
        frame.extend_from_slice(&checksum.to_le_bytes());

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame).unwrap();
        assert_eq!(result, data);
    }
1066
    #[test]
    fn test_integration_invalid_checksum_rejected() {
        let data = b"Test data";

        let mut frame = vec![];
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
        // Descriptor: single-segment + content-checksum flag.
        frame.push(0x24);
        frame.push(data.len() as u8);

        // Single raw block, last, carrying `data`.
        let header = (data.len() << 3) | 1;
        frame.push((header & 0xFF) as u8);
        frame.push(((header >> 8) & 0xFF) as u8);
        frame.push(((header >> 16) & 0xFF) as u8);
        frame.extend_from_slice(data);

        // Garbage checksum bytes — decompression must fail verification.
        frame.extend_from_slice(&[0xDE, 0xAD, 0xBE, 0xEF]);

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame);
        assert!(result.is_err());
    }
1089
    #[test]
    fn test_integration_content_size_mismatch_rejected() {
        let data = b"Short";

        let mut frame = vec![];
        frame.extend_from_slice(&[0x28, 0xB5, 0x2F, 0xFD]);
        frame.push(0x20);
        // Declared frame content size (100) disagrees with the actual
        // decoded length (5) — the decoder must reject the frame.
        frame.push(100);

        let header = (data.len() << 3) | 1;
        frame.push((header & 0xFF) as u8);
        frame.push(((header >> 8) & 0xFF) as u8);
        frame.push(((header >> 16) & 0xFF) as u8);
        frame.extend_from_slice(data);

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&frame);
        assert!(result.is_err());
    }
1109
1110 #[test]
1115 fn test_roundtrip_empty() {
1116 let compressor = ZstdCompressor::new();
1117 let decompressor = ZstdDecompressor::new();
1118
1119 let input: &[u8] = &[];
1120 let compressed = compressor.compress(input).unwrap();
1121 let decompressed = decompressor.decompress(&compressed).unwrap();
1122
1123 assert_eq!(decompressed, input);
1124 }
1125
1126 #[test]
1127 fn test_roundtrip_small() {
1128 let compressor = ZstdCompressor::new();
1129 let decompressor = ZstdDecompressor::new();
1130
1131 let input = b"Hello, World!";
1132 let compressed = compressor.compress(input).unwrap();
1133 let decompressed = decompressor.decompress(&compressed).unwrap();
1134
1135 assert_eq!(decompressed, input);
1136 }
1137
1138 #[test]
1139 fn test_roundtrip_rle() {
1140 let compressor = ZstdCompressor::new();
1141 let decompressor = ZstdDecompressor::new();
1142
1143 let input = vec![b'A'; 100];
1144 let compressed = compressor.compress(&input).unwrap();
1145 let decompressed = decompressor.decompress(&compressed).unwrap();
1146
1147 assert_eq!(decompressed, input);
1148 assert!(compressed.len() < input.len());
1150 }
1151
1152 #[test]
1153 fn test_roundtrip_binary() {
1154 let compressor = ZstdCompressor::new();
1155 let decompressor = ZstdDecompressor::new();
1156
1157 let input: Vec<u8> = (0..=255).collect();
1158 let compressed = compressor.compress(&input).unwrap();
1159 let decompressed = decompressor.decompress(&compressed).unwrap();
1160
1161 assert_eq!(decompressed, input);
1162 }
1163
1164 #[test]
1165 fn test_roundtrip_repeated_pattern() {
1166 let compressor = ZstdCompressor::new();
1167 let decompressor = ZstdDecompressor::new();
1168
1169 let pattern = b"0123456789ABCDEF";
1171 let mut input = Vec::new();
1172 for _ in 0..10 {
1173 input.extend_from_slice(pattern);
1174 }
1175
1176 let compressed = compressor.compress(&input).unwrap();
1177 let decompressed = decompressor.decompress(&compressed).unwrap();
1178
1179 assert_eq!(decompressed, input);
1180 }
1181
1182 #[test]
1183 fn test_roundtrip_compression_levels() {
1184 let decompressor = ZstdDecompressor::new();
1185 let input = b"Test data for compression level testing. This needs to be long enough to trigger actual compression.";
1186
1187 for level in [
1188 CompressionLevel::None,
1189 CompressionLevel::Fast,
1190 CompressionLevel::Default,
1191 CompressionLevel::Best,
1192 ] {
1193 let compressor = ZstdCompressor::with_level(level);
1194 let compressed = compressor.compress(input).unwrap();
1195 let decompressed = decompressor.decompress(&compressed).unwrap();
1196
1197 assert_eq!(
1198 decompressed, input,
1199 "Roundtrip failed for level {:?}",
1200 level
1201 );
1202 }
1203 }
1204
1205 #[test]
1206 fn test_codec_roundtrip() {
1207 let codec = ZstdCodec::new();
1208 let input = b"Testing the codec roundtrip functionality";
1209
1210 let compressed = Compressor::compress(&codec, input).unwrap();
1211 let decompressed = Decompressor::decompress(&codec, &compressed).unwrap();
1212
1213 assert_eq!(decompressed, input);
1214 }
1215
1216 #[test]
1221 fn test_roundtrip_uniform_pattern() {
1222 let compressor = ZstdCompressor::new();
1224 let decompressor = ZstdDecompressor::new();
1225
1226 let input = b"abcdabcdabcdabcdabcdabcdabcdabcd";
1228 let compressed = compressor.compress(input).unwrap();
1229 let decompressed = decompressor.decompress(&compressed).unwrap();
1230
1231 assert_eq!(decompressed, input);
1232 }
1233
1234 #[test]
1235 fn test_roundtrip_longer_uniform_pattern() {
1236 let compressor = ZstdCompressor::new();
1237 let decompressor = ZstdDecompressor::new();
1238
1239 let pattern = b"Hello World! ";
1241 let mut input = Vec::new();
1242 for _ in 0..20 {
1243 input.extend_from_slice(pattern);
1244 }
1245
1246 let compressed = compressor.compress(&input).unwrap();
1247 let decompressed = decompressor.decompress(&compressed).unwrap();
1248
1249 assert_eq!(decompressed, input);
1250 assert!(compressed.len() < input.len());
1252 }
1253
1254 #[test]
1255 fn test_roundtrip_overlapping_matches() {
1256 let compressor = ZstdCompressor::new();
1257 let decompressor = ZstdDecompressor::new();
1258
1259 let input = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
1262
1263 let compressed = compressor.compress(input).unwrap();
1264 let decompressed = decompressor.decompress(&compressed).unwrap();
1265
1266 assert_eq!(decompressed, input);
1267 assert!(compressed.len() < input.len() / 2);
1269 }
1270
1271 #[test]
1272 fn test_roundtrip_mixed_patterns() {
1273 let compressor = ZstdCompressor::new();
1274 let decompressor = ZstdDecompressor::new();
1275
1276 let mut input = Vec::new();
1278 input.extend_from_slice(b"prefix_");
1279 for _ in 0..10 {
1280 input.extend_from_slice(b"pattern_");
1281 }
1282 input.extend_from_slice(b"suffix");
1283
1284 let compressed = compressor.compress(&input).unwrap();
1285 let decompressed = decompressor.decompress(&compressed).unwrap();
1286
1287 assert_eq!(decompressed, input);
1288 }
1289
1290 #[test]
1291 fn test_roundtrip_single_byte_repeats() {
1292 let compressor = ZstdCompressor::new();
1293 let decompressor = ZstdDecompressor::new();
1294
1295 let mut input = Vec::new();
1297 for _ in 0..10 {
1298 input.extend(vec![b'X'; 20]);
1299 input.extend(vec![b'Y'; 20]);
1300 }
1301
1302 let compressed = compressor.compress(&input).unwrap();
1303 let decompressed = decompressor.decompress(&compressed).unwrap();
1304
1305 assert_eq!(decompressed, input);
1306 }
1308
1309 #[test]
1310 fn test_roundtrip_various_pattern_lengths() {
1311 let compressor = ZstdCompressor::new();
1312 let decompressor = ZstdDecompressor::new();
1313
1314 for pattern_len in 3..=8 {
1316 let pattern: Vec<u8> = (0..pattern_len).map(|i| b'A' + i).collect();
1317 let mut input = Vec::new();
1318 for _ in 0..20 {
1319 input.extend_from_slice(&pattern);
1320 }
1321
1322 let compressed = compressor.compress(&input).unwrap();
1323 let decompressed = decompressor.decompress(&compressed).unwrap();
1324
1325 assert_eq!(
1326 decompressed, input,
1327 "Failed for pattern length {}",
1328 pattern_len
1329 );
1330 }
1331 }
1332
1333 #[test]
1334 fn test_roundtrip_llm_weights_pattern() {
1335 let compressor = ZstdCompressor::new();
1338 let decompressor = ZstdDecompressor::new();
1339
1340 let f16_patterns: &[u16] = &[
1343 0x0000, 0x1400, 0x9400, 0x2000, 0xA000, 0x2E00, 0xAE00, 0x3800, 0xB800, ];
1353
1354 for size in [1024, 4096] {
1355 let mut input = Vec::with_capacity(size);
1356 let mut idx = 0;
1357 while input.len() < size {
1358 let val = f16_patterns[idx % f16_patterns.len()];
1359 input.extend_from_slice(&val.to_le_bytes());
1360 idx += 1;
1361 }
1362 input.truncate(size);
1363
1364 let compressed = compressor.compress(&input).unwrap();
1365 eprintln!(
1366 "Size {}: input={} bytes, compressed={} bytes",
1367 size,
1368 input.len(),
1369 compressed.len()
1370 );
1371
1372 let block_data = &compressed[11..]; let lit_byte0 = block_data[0];
1375 let lit_type = lit_byte0 & 0x03;
1376 let size_format = (lit_byte0 >> 2) & 0x03;
1377 eprintln!("Literals: type={}, size_format={}", lit_type, size_format);
1378
1379 if lit_type == 2 && size_format == 2 {
1380 let regen = ((block_data[0] >> 4) as usize)
1382 | ((block_data[1] as usize) << 4)
1383 | (((block_data[2] & 0x0F) as usize) << 12);
1384 let comp = ((block_data[2] >> 4) as usize)
1385 | ((block_data[3] as usize) << 4)
1386 | (((block_data[4] & 0x03) as usize) << 12);
1387 eprintln!(
1388 "Literals header: regen={}, comp={}, header_size=5",
1389 regen, comp
1390 );
1391 eprintln!("Total literals section: {}", 5 + comp);
1392
1393 let weights_header = block_data[5];
1395 eprintln!(
1396 "Huffman weights header byte: {:02x} ({})",
1397 weights_header, weights_header
1398 );
1399
1400 use crate::huffman::HuffmanEncoder;
1402 if let Some(test_encoder) = HuffmanEncoder::build(&input) {
1403 let test_weights = test_encoder.serialize_weights();
1404 eprintln!(
1405 "Encoder produced weights: first 10 bytes = {:02x?}",
1406 &test_weights[..10.min(test_weights.len())]
1407 );
1408 eprintln!("Weights length = {}", test_weights.len());
1409 }
1410
1411 let seq_pos = 5 + comp;
1413 if block_data.len() > seq_pos {
1414 eprintln!("Sequences start byte: {:02x}", block_data[seq_pos]);
1415 }
1416 }
1417
1418 match decompressor.decompress(&compressed) {
1419 Ok(decompressed) => {
1420 assert_eq!(
1421 decompressed, input,
1422 "LLM weights roundtrip failed for size {}",
1423 size
1424 );
1425 }
1426 Err(e) => {
1427 eprintln!("Decompression failed for size {}: {:?}", size, e);
1428 if compressed.len() > 12 {
1430 eprintln!("Frame header bytes: {:02x?}", &compressed[..12]);
1431 }
1432 panic!("Decompression failed for size {}: {:?}", size, e);
1433 }
1434 }
1435 }
1436 }
1437
1438 #[test]
1439 fn test_roundtrip_large_pattern_block() {
1440 let compressor = ZstdCompressor::new();
1441 let decompressor = ZstdDecompressor::new();
1442
1443 let pattern = b"0123456789";
1446 let mut input = Vec::new();
1447 for _ in 0..100 {
1448 input.extend_from_slice(pattern);
1449 }
1450
1451 let compressed = compressor.compress(&input).unwrap();
1452 let decompressed = decompressor.decompress(&compressed).unwrap();
1453
1454 assert_eq!(decompressed, input);
1455 }
1456
1457 #[test]
1462 fn test_custom_table_in_zstd_frame() {
1463 let custom_tables = CustomFseTables::new();
1465 let compressor = ZstdCompressor::with_custom_tables(custom_tables);
1466 let decompressor = ZstdDecompressor::new();
1467
1468 let data = b"ABCDABCDABCDABCD".repeat(100);
1470 let compressed = compressor.compress(&data).unwrap();
1471 let decompressed = decompressor.decompress(&compressed).unwrap();
1472
1473 assert_eq!(decompressed, data);
1474 }
1475
1476 #[test]
1477 fn test_custom_tables_with_level() {
1478 let custom_tables = CustomFseTables::new();
1480 let compressor =
1481 ZstdCompressor::with_level_and_tables(CompressionLevel::Best, custom_tables);
1482 let decompressor = ZstdDecompressor::new();
1483
1484 let data = b"Test data for custom tables with compression level.".repeat(50);
1485 let compressed = compressor.compress(&data).unwrap();
1486 let decompressed = decompressor.decompress(&compressed).unwrap();
1487
1488 assert_eq!(decompressed, data);
1489 assert_eq!(compressor.level(), CompressionLevel::Best);
1490 }
1491
1492 #[test]
1493 fn test_custom_tables_api() {
1494 let tables = CustomFseTables::new();
1496 assert!(!tables.has_custom_tables());
1497
1498 let ll_table = fse::cached_ll_table().clone();
1500 let tables_with_ll = CustomFseTables::new().with_ll_table(ll_table);
1501 assert!(tables_with_ll.has_custom_tables());
1502 assert!(tables_with_ll.ll_table.is_some());
1503 assert!(tables_with_ll.of_table.is_none());
1504 assert!(tables_with_ll.ml_table.is_none());
1505 }
1506
1507 #[test]
1508 fn test_compressor_with_custom_tables_getter() {
1509 let tables = CustomFseTables::new();
1511 let compressor = ZstdCompressor::with_custom_tables(tables);
1512 assert!(compressor.custom_tables().is_some());
1513
1514 let default_compressor = ZstdCompressor::new();
1515 assert!(default_compressor.custom_tables().is_none());
1516 }
1517
1518 #[test]
1523 fn test_huffman_integration_with_zstd() {
1524 let training_data = b"The quick brown fox jumps over the lazy dog. ".repeat(100);
1526 let encoder =
1527 huffman::HuffmanEncoder::build(&training_data).expect("Should build Huffman encoder");
1528
1529 let custom_huffman = CustomHuffmanTable::new(encoder);
1531 let compressor = ZstdCompressor::with_custom_huffman(custom_huffman);
1532 let decompressor = ZstdDecompressor::new();
1533
1534 let test_data = b"The lazy fox quickly jumps over the brown dog. ".repeat(50);
1536 let compressed = compressor.compress(&test_data).unwrap();
1537 let decompressed = decompressor.decompress(&compressed).unwrap();
1538
1539 assert_eq!(decompressed, test_data);
1540 }
1541
1542 #[test]
1543 fn test_huffman_encoder_from_weights() {
1544 let mut weights = vec![0u8; 256];
1546 weights[b'a' as usize] = 8; weights[b'b' as usize] = 7;
1549 weights[b'c' as usize] = 6;
1550 weights[b'd' as usize] = 5;
1551 weights[b'e' as usize] = 4;
1552
1553 let encoder =
1554 huffman::HuffmanEncoder::from_weights(&weights).expect("Should build from weights");
1555
1556 assert_eq!(encoder.num_symbols(), 5);
1558 assert!(encoder.max_bits() <= 11); let codes = encoder.get_codes();
1562 assert!(codes[b'a' as usize].num_bits > 0);
1563 assert!(codes[b'b' as usize].num_bits > 0);
1564 }
1565
1566 #[test]
1567 fn test_custom_huffman_api() {
1568 let data = b"test data for huffman".repeat(100);
1570 let encoder = huffman::HuffmanEncoder::build(&data).expect("Should build encoder");
1571
1572 let custom_huffman = CustomHuffmanTable::new(encoder);
1573
1574 let codes = custom_huffman.encoder().get_codes();
1576 assert!(codes[b't' as usize].num_bits > 0);
1577 }
1578
1579 #[test]
1580 fn test_compressor_with_all_options() {
1581 let sample_data = b"Sample data for training ".repeat(100);
1583
1584 let custom_fse = CustomFseTables::new();
1586 let encoder = huffman::HuffmanEncoder::build(&sample_data).expect("Should build encoder");
1587 let custom_huffman = CustomHuffmanTable::new(encoder);
1588
1589 let compressor = ZstdCompressor::with_all_options(
1591 CompressionLevel::Default,
1592 Some(custom_fse),
1593 Some(custom_huffman),
1594 );
1595 let decompressor = ZstdDecompressor::new();
1596
1597 let test_data = b"Sample text for compression testing ".repeat(50);
1599 let compressed = compressor.compress(&test_data).unwrap();
1600 let decompressed = decompressor.decompress(&compressed).unwrap();
1601
1602 assert_eq!(decompressed, test_data);
1603
1604 assert!(compressor.custom_tables().is_some());
1606 assert!(compressor.custom_huffman().is_some());
1607 }
1608
1609 #[test]
1610 fn test_custom_huffman_getter() {
1611 let data = b"test".repeat(100);
1613 let encoder = huffman::HuffmanEncoder::build(&data).unwrap();
1614 let custom = CustomHuffmanTable::new(encoder);
1615
1616 let compressor = ZstdCompressor::with_custom_huffman(custom);
1617 assert!(compressor.custom_huffman().is_some());
1618
1619 let default_compressor = ZstdCompressor::new();
1620 assert!(default_compressor.custom_huffman().is_none());
1621 }
1622}
1623
#[cfg(test)]
mod huffman_debug_tests {
    use crate::huffman::{build_table_from_weights, parse_huffman_weights, HuffmanEncoder};

    /// Deterministic English-like filler: cycles through a fixed word list
    /// until exactly `size` bytes are produced (the last word may be cut).
    fn generate_text_like_data(size: usize) -> Vec<u8> {
        let words = [
            "the ",
            "quick ",
            "brown ",
            "fox ",
            "jumps ",
            "over ",
            "lazy ",
            "dog ",
            "compression ",
            "algorithm ",
            "performance ",
            "benchmark ",
            "testing ",
        ];
        let mut data = Vec::with_capacity(size);
        let mut i = 0;
        while data.len() < size {
            let word = words[i % words.len()];
            let remaining = size - data.len();
            // Never overshoot `size`: copy only as much of the word as fits.
            let to_copy = remaining.min(word.len());
            data.extend_from_slice(&word.as_bytes()[..to_copy]);
            i += 1;
        }
        data
    }

    /// Diagnostic trace (pass/fail only via `expect`s): builds a Huffman
    /// encoder over text-like data, serializes and re-parses its weight
    /// table, and prints consistency checks including a Kraft-inequality sum
    /// over the code lengths implied by the weights.
    #[test]
    fn test_trace_huffman_weights_text() {
        let data = generate_text_like_data(20000);

        // `build` returning None means Huffman coding is not beneficial for
        // this data; that is a valid, non-failing outcome here.
        let encoder = HuffmanEncoder::build(&data);
        if encoder.is_none() {
            println!("Encoder returned None - Huffman not suitable for data");
            return;
        }
        let encoder = encoder.unwrap();
        let weights = encoder.serialize_weights();

        println!(
            "Serialized weights: {} bytes, header={}",
            weights.len(),
            weights[0]
        );
        // Header byte minus 127 gives the symbol count (direct-representation
        // weight header per the zstd Huffman convention — header >= 128).
        let num_symbols = (weights[0] - 127) as usize;
        println!("Number of symbols from header: {}", num_symbols);

        let (parsed_weights, consumed) = parse_huffman_weights(&weights).expect("Should parse");
        println!(
            "Parsed {} weights, consumed {} bytes",
            parsed_weights.len(),
            consumed
        );

        // List every symbol that actually carries a weight.
        let non_zero: Vec<_> = parsed_weights
            .iter()
            .enumerate()
            .filter(|&(_, &w)| w > 0)
            .map(|(i, &w)| (i as u8 as char, w))
            .collect();
        println!(
            "Non-zero weights ({} total): {:?}",
            non_zero.len(),
            non_zero
        );

        // For a complete zstd weight table, sum(2^w) over coded symbols
        // should equal 2^(max_w + 1).
        let max_w = *parsed_weights.iter().max().unwrap_or(&0);
        let weight_sum: u64 = parsed_weights
            .iter()
            .filter(|&&w| w > 0)
            .map(|&w| 1u64 << w)
            .sum();
        println!("Max weight: {}, sum(2^w): {}", max_w, weight_sum);
        println!("Expected sum: 2^{} = {}", max_w + 1, 1u64 << (max_w + 1));

        // Histogram of code lengths: weight w implies length (max_w + 1 - w).
        let mut bl_count = vec![0u32; max_w as usize + 2];
        for &w in &parsed_weights {
            if w > 0 {
                let code_len = (max_w + 1 - w) as usize;
                if code_len < bl_count.len() {
                    bl_count[code_len] += 1;
                }
            }
        }

        // Kraft check: sum over lengths of count(len) * 2^(max_w - len) must
        // equal 2^max_w for a complete prefix code; the ratio below should
        // print as 1 when the table is well-formed.
        let kraft_sum: u64 = bl_count
            .iter()
            .enumerate()
            .skip(1)
            .filter(|&(len, _)| len <= max_w as usize)
            .map(|(len, &count)| {
                let contribution = 1u64 << (max_w as usize - len);
                contribution * count as u64
            })
            .sum();
        let expected_kraft = 1u64 << max_w;
        println!(
            "Kraft check: sum={}, expected={} (ratio: {})",
            kraft_sum,
            expected_kraft,
            kraft_sum as f64 / expected_kraft as f64
        );

        // Finally, attempt to build a decode table from the parsed weights.
        let result = build_table_from_weights(parsed_weights.clone());
        println!("Build result: {:?}", result.is_ok());
        if let Err(e) = &result {
            println!("Error: {:?}", e);
        }
    }
}
1745
#[cfg(test)]
mod debug_tests {
    use super::*;
    use crate::compress::CompressContext;
    use crate::huffman::HuffmanEncoder;
    use haagenti_core::CompressionLevel;

    /// Deterministic English-like filler: cycles through a fixed word list
    /// until exactly `size` bytes are produced (the last word may be cut).
    fn generate_text_data(size: usize) -> Vec<u8> {
        let words = [
            "the ",
            "quick ",
            "brown ",
            "fox ",
            "jumps ",
            "over ",
            "lazy ",
            "dog ",
            "compression ",
            "algorithm ",
            "performance ",
            "benchmark ",
            "testing ",
            "data ",
            "stream ",
            "encode ",
            "decode ",
            "entropy ",
            "symbol ",
            "table ",
        ];
        let mut data = Vec::with_capacity(size);
        let mut i = 0;
        while data.len() < size {
            let word = words[i % words.len()];
            let remaining = size - data.len();
            // Never overshoot `size`: copy only as much of the word as fits.
            let to_copy = remaining.min(word.len());
            data.extend_from_slice(&word.as_bytes()[..to_copy]);
            i += 1;
        }
        data
    }

    /// Diagnostic trace over 100KB of text: prints unique-symbol count,
    /// Huffman estimated vs. actual sizes, and the full-pipeline ratio.
    #[test]
    fn test_trace_100kb_text() {
        let data = generate_text_data(102400);

        // Byte-frequency histogram to count distinct symbols.
        let mut freq = [0u64; 256];
        for &b in &data {
            freq[b as usize] += 1;
        }
        let unique_count = freq.iter().filter(|&&f| f > 0).count();
        println!("100KB text: {} unique symbols", unique_count);

        let encoder = HuffmanEncoder::build(&data);
        println!("Huffman encoder built: {}", encoder.is_some());

        if let Some(enc) = &encoder {
            // Compare the size estimator against the real encoded size
            // (payload plus serialized weight table).
            let estimated = enc.estimate_size(&data);
            println!("Estimated size: {} (original: {})", estimated, data.len());

            let compressed = enc.encode(&data);
            let weights = enc.serialize_weights();
            println!(
                "Actual compressed: {} + {} weights = {}",
                compressed.len(),
                weights.len(),
                compressed.len() + weights.len()
            );
        }

        // Full zstd pipeline for comparison against Huffman-only numbers.
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let result = ctx.compress(&data).unwrap();
        println!(
            "Full compression: {} -> {} bytes ({:.2}x)",
            data.len(),
            result.len(),
            data.len() as f64 / result.len() as f64
        );
    }
}
1829
#[cfg(test)]
mod debug_tests2 {
    use super::*;
    use crate::compress::CompressContext;
    use crate::huffman::HuffmanEncoder;
    use haagenti_core::CompressionLevel;
    use rand::rngs::StdRng;
    use rand::{Rng, SeedableRng};

    /// Like the periodic text generator but picks words with a seeded RNG,
    /// giving a non-periodic (yet reproducible) stream of exactly `size`
    /// bytes.
    fn generate_text_random(size: usize) -> Vec<u8> {
        let words = [
            "the ",
            "quick ",
            "brown ",
            "fox ",
            "jumps ",
            "over ",
            "lazy ",
            "dog ",
            "compression ",
            "algorithm ",
            "performance ",
            "benchmark ",
            "testing ",
            "data ",
            "stream ",
            "encode ",
            "decode ",
            "entropy ",
            "symbol ",
            "table ",
        ];
        // Fixed seed keeps the "random" stream reproducible across runs.
        let mut rng = StdRng::seed_from_u64(456);
        let mut data = Vec::with_capacity(size);
        while data.len() < size {
            let word = words[rng.gen_range(0..words.len())];
            let remaining = size - data.len();
            // Never overshoot `size`.
            let to_copy = remaining.min(word.len());
            data.extend_from_slice(&word.as_bytes()[..to_copy]);
        }
        data
    }

    /// Diagnostic trace: same statistics as the periodic 100KB test, but
    /// with randomized word order so matches are less trivially periodic.
    #[test]
    fn test_trace_100kb_text_random() {
        let data = generate_text_random(102400);

        // Byte-frequency histogram.
        let mut freq = [0u64; 256];
        for &b in &data {
            freq[b as usize] += 1;
        }
        let unique_count = freq.iter().filter(|&&f| f > 0).count();
        println!("100KB random text: {} unique symbols", unique_count);

        // Show the ten most frequent symbols.
        let mut freqs: Vec<_> = freq.iter().enumerate().filter(|&(_, f)| *f > 0).collect();
        freqs.sort_by(|a, b| b.1.cmp(a.1));
        println!(
            "Top frequencies: {:?}",
            freqs
                .iter()
                .take(10)
                .map(|(i, f)| ((*i as u8) as char, *f))
                .collect::<Vec<_>>()
        );

        let encoder = HuffmanEncoder::build(&data);
        println!("Huffman encoder built: {}", encoder.is_some());

        if let Some(enc) = &encoder {
            let estimated = enc.estimate_size(&data);
            println!("Estimated size: {} (original: {})", estimated, data.len());
        }

        // Full pipeline ratio for comparison.
        let mut ctx = CompressContext::new(CompressionLevel::Default);
        let result = ctx.compress(&data).unwrap();
        println!(
            "Full compression: {} -> {} bytes ({:.2}x)",
            data.len(),
            result.len(),
            data.len() as f64 / result.len() as f64
        );
    }
}
1917
#[cfg(test)]
mod large_tests {
    use super::*;

    /// Build a buffer of exactly `len` bytes from a repeating sentence, then
    /// verify a haagenti compress/decompress roundtrip is lossless.
    fn assert_roundtrip(len: usize) {
        let pattern = b"The quick brown fox jumps over the lazy dog. ";
        let mut data = Vec::with_capacity(len + pattern.len());
        while data.len() < len {
            data.extend_from_slice(pattern);
        }
        data.truncate(len);

        let compressed = ZstdCompressor::new()
            .compress(&data)
            .expect("Compression failed");
        let decompressed = ZstdDecompressor::new()
            .decompress(&compressed)
            .expect("Decompression failed");

        assert_eq!(data.len(), decompressed.len(), "Length mismatch");
        assert_eq!(data, decompressed, "Content mismatch");
    }

    #[test]
    #[ignore = "Pre-existing bug: checksum mismatch at 65KB+ sizes"]
    fn test_benchmark_text_65kb() {
        assert_roundtrip(65536);
    }

    #[test]
    fn test_roundtrip_16kb() {
        assert_roundtrip(16384);
    }
}
1969
1970#[cfg(test)]
1972mod cross_library_tests {
1973 use super::*;
1974
1975 fn generate_test_data(size: usize) -> Vec<u8> {
1976 let pattern = b"The quick brown fox jumps over the lazy dog. ";
1977 let mut data = Vec::with_capacity(size);
1978 while data.len() < size {
1979 data.extend_from_slice(pattern);
1980 }
1981 data.truncate(size);
1982 data
1983 }
1984
    /// Cross-library direction 1: compress with haagenti, decompress with the
    /// reference `zstd` crate. A failure here localizes the bug to haagenti's
    /// COMPRESSION path (the reference decoder rejects or mis-decodes our
    /// frames).
    #[test]
    fn test_haagenti_compress_zstd_decompress_65kb() {
        let data = generate_test_data(65536);

        let compressor = ZstdCompressor::new();
        let compressed = compressor
            .compress(&data)
            .expect("Haagenti compression failed");

        // Hand our output to the reference implementation.
        let result = zstd::decode_all(compressed.as_slice());

        match result {
            Ok(decompressed) => {
                assert_eq!(data.len(), decompressed.len(), "Length mismatch");
                if data != decompressed {
                    // Report the first diverging byte before failing, to aid
                    // debugging.
                    for (i, (a, b)) in data.iter().zip(decompressed.iter()).enumerate() {
                        if a != b {
                            println!(
                                "First divergence at byte {}: expected {:02x}, got {:02x}",
                                i, a, b
                            );
                            break;
                        }
                    }
                    panic!("Content mismatch - haagenti compression produces invalid output for reference zstd");
                }
            }
            Err(e) => {
                println!(
                    "Reference zstd failed to decompress haagenti output: {:?}",
                    e
                );
                println!("This confirms the bug is in HAAGENTI COMPRESSION");
                panic!("Haagenti compression output is invalid");
            }
        }
    }
2027
2028 #[test]
2033 fn test_zstd_reference_raw_blocks() {
2034 for size in [100, 200] {
2036 let data: Vec<u8> = (0..size).map(|i| ((i * 17 + 31) % 256) as u8).collect();
2037 let compressed =
2038 zstd::encode_all(data.as_slice(), 1).expect("Reference zstd compression failed");
2039
2040 let decompressor = ZstdDecompressor::new();
2041 let decompressed = decompressor
2042 .decompress(&compressed)
2043 .expect(&format!("Failed to decompress size {}", size));
2044 assert_eq!(data, decompressed, "Size {} content mismatch", size);
2045 }
2046 }
2047
    /// Cross-library direction 2: compress with the reference `zstd` crate,
    /// decompress with haagenti. A failure here localizes the bug to
    /// haagenti's DECOMPRESSION path. Ignored due to a known bug.
    #[test]
    #[ignore = "Pre-existing bug: reference zstd compatibility for compressed blocks"]
    fn test_zstd_compress_haagenti_decompress_65kb() {
        let data = generate_test_data(65536);

        let compressed =
            zstd::encode_all(data.as_slice(), 3).expect("Reference zstd compression failed");

        // Hex-dump the frame prefix for manual inspection.
        println!("Compressed size: {} bytes", compressed.len());
        print!("First 64 bytes: ");
        for (i, &b) in compressed.iter().take(64).enumerate() {
            if i % 16 == 0 {
                print!("\n ");
            }
            print!("{:02x} ", b);
        }
        println!();

        // Decode the frame magic and header-descriptor bits by hand.
        if compressed.len() >= 4 {
            let magic =
                u32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]);
            println!("Magic: 0x{:08x} (expected 0xfd2fb528)", magic);
        }
        if compressed.len() >= 5 {
            let fhd = compressed[4];
            println!("Frame header descriptor: 0x{:02x}", fhd);
            println!(" - Checksum flag: {}", (fhd >> 2) & 1);
            println!(" - Single segment flag: {}", (fhd >> 5) & 1);
            println!(" - Dictionary ID flag: {}", fhd & 0x03);
            println!(" - FCS field size: {}", (fhd >> 6) & 0x03);
        }

        let decompressor = ZstdDecompressor::new();
        let result = decompressor.decompress(&compressed);

        match result {
            Ok(decompressed) => {
                assert_eq!(data.len(), decompressed.len(), "Length mismatch");
                if data != decompressed {
                    // Report the first diverging byte before failing.
                    for (i, (a, b)) in data.iter().zip(decompressed.iter()).enumerate() {
                        if a != b {
                            println!(
                                "First divergence at byte {}: expected {:02x}, got {:02x}",
                                i, a, b
                            );
                            break;
                        }
                    }
                    panic!("Content mismatch - haagenti decompression produces incorrect output");
                }
            }
            Err(e) => {
                println!(
                    "Haagenti failed to decompress reference zstd output: {:?}",
                    e
                );
                println!("This confirms the bug is in HAAGENTI DECOMPRESSION");
                panic!("Haagenti decompression failed on valid zstd data");
            }
        }
    }
2119
2120 #[test]
2122 fn test_find_threshold_size() {
2123 let sizes: Vec<usize> = (16..=32).map(|k| k * 1024).collect();
2125
2126 for size in sizes {
2127 let data = generate_test_data(size);
2128 let compressor = ZstdCompressor::new();
2129 let decompressor = ZstdDecompressor::new();
2130
2131 let compressed = compressor.compress(&data).expect("Compression failed");
2132 let result = decompressor.decompress(&compressed);
2133
2134 match result {
2135 Ok(decompressed) if decompressed == data => {
2136 println!("Size {} ({}KB): OK", size, size / 1024);
2137 }
2138 Ok(decompressed) => {
2139 println!(
2140 "Size {} ({}KB): CONTENT MISMATCH (len: {} vs {})",
2141 size,
2142 size / 1024,
2143 data.len(),
2144 decompressed.len()
2145 );
2146 }
2147 Err(e) => {
2148 println!("Size {} ({}KB): ERROR - {:?}", size, size / 1024, e);
2149 }
2150 }
2151 }
2152 }
2153
2154 #[test]
2156 fn test_analyze_compression_failure() {
2157 for &size in &[16384, 20000, 24000, 28000, 32768] {
2159 let data = generate_test_data(size);
2160
2161 let compressor = ZstdCompressor::new();
2163 let haagenti_compressed = compressor.compress(&data).expect("Compression failed");
2164
2165 let zstd_compressed = zstd::encode_all(data.as_slice(), 3).expect("zstd failed");
2167
2168 let zstd_result = zstd::decode_all(haagenti_compressed.as_slice());
2170
2171 println!(
2172 "Size {}: haagenti={} bytes, zstd={} bytes, zstd_decode_haagenti={:?}",
2173 size,
2174 haagenti_compressed.len(),
2175 zstd_compressed.len(),
2176 zstd_result
2177 .as_ref()
2178 .map(|v| v.len())
2179 .map_err(|e| format!("{:?}", e))
2180 );
2181 }
2182 }
2183
2184 #[test]
2186 fn test_check_block_boundaries() {
2187 let sizes = [8192, 16384, 16385, 20000, 24576, 32768, 32769];
2189
2190 for &size in &sizes {
2191 let data = generate_test_data(size);
2192 let compressor = ZstdCompressor::new();
2193
2194 let compressed = compressor.compress(&data).expect("Compression failed");
2195
2196 let zstd_result = zstd::decode_all(compressed.as_slice());
2198
2199 println!(
2200 "Size {}: compressed={} bytes, zstd_decode={:?}",
2201 size,
2202 compressed.len(),
2203 match &zstd_result {
2204 Ok(v) if *v == data => "OK".to_string(),
2205 Ok(v) => format!("MISMATCH (len {})", v.len()),
2206 Err(e) => format!("ERROR: {}", e),
2207 }
2208 );
2209 }
2210 }
2211
    /// Hand-decodes the frame our compressor emits for a 25KB input, printing
    /// every header field, then asks reference zstd to decode it. Purely
    /// diagnostic; only fails via indexing/expect on malformed output.
    #[test]
    fn test_debug_compression_trace() {
        let size = 25600; let data = generate_test_data(size);

        println!("Input size: {} bytes", data.len());
        println!("First 50 bytes: {:?}", &data[..50.min(data.len())]);

        let compressor = ZstdCompressor::new();
        let compressed = compressor.compress(&data).expect("Compression failed");

        println!("Compressed size: {} bytes", compressed.len());
        println!(
            "Compressed header: {:02x?}",
            &compressed[..20.min(compressed.len())]
        );

        // Frame magic number (little-endian); must equal ZSTD_MAGIC.
        let magic =
            u32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]);
        println!("Magic: 0x{:08X} (valid={})", magic, magic == 0xFD2FB528);

        // Frame header descriptor: bit 2 = content checksum, bit 5 =
        // single-segment, bits 6-7 select the FCS field width.
        let descriptor = compressed[4];
        let has_checksum = (descriptor & 0x04) != 0;
        let single_segment = (descriptor & 0x20) != 0;
        let fcs_size = match descriptor >> 6 {
            0 => {
                // FCS code 0 means a 1-byte FCS only in single-segment
                // frames; otherwise the FCS field is absent.
                if single_segment {
                    1
                } else {
                    0
                }
            }
            1 => 2,
            2 => 4,
            3 => 8,
            _ => 0,
        };
        println!(
            "Descriptor: 0x{:02X}, checksum={}, single_segment={}, fcs_size={}",
            descriptor, has_checksum, single_segment, fcs_size
        );

        // Non-single-segment frames carry a 1-byte window descriptor before
        // the FCS, hence the 5-vs-6 offset.
        let fcs_start = if single_segment { 5 } else { 6 };
        let fcs = match fcs_size {
            1 => compressed[fcs_start] as u64,
            2 => {
                // The 2-byte FCS encoding is biased by 256.
                u16::from_le_bytes([compressed[fcs_start], compressed[fcs_start + 1]]) as u64 + 256
            }
            4 => u32::from_le_bytes([
                compressed[fcs_start],
                compressed[fcs_start + 1],
                compressed[fcs_start + 2],
                compressed[fcs_start + 3],
            ]) as u64,
            8 => u64::from_le_bytes(compressed[fcs_start..fcs_start + 8].try_into().unwrap()),
            _ => 0,
        };
        println!("Frame Content Size: {} (input was {})", fcs, size);

        // First block header: 3 LE bytes; bit 0 = last-block flag,
        // bits 1-2 = block type, remaining bits = block size.
        let block_start = fcs_start + fcs_size;
        let block_header = u32::from_le_bytes([
            compressed[block_start],
            compressed[block_start + 1],
            compressed[block_start + 2],
            0,
        ]);
        let is_last = (block_header & 1) != 0;
        let block_type = (block_header >> 1) & 3;
        let block_size = (block_header >> 3) as usize;

        let block_type_name = match block_type {
            0 => "Raw",
            1 => "RLE",
            2 => "Compressed",
            _ => "Reserved",
        };
        println!(
            "Block: type={} ({}), size={}, is_last={}",
            block_type, block_type_name, block_size, is_last
        );

        // Final sanity check: can the reference implementation decode it?
        let result = zstd::decode_all(compressed.as_slice());
        println!(
            "Reference zstd decode: {:?}",
            result.as_ref().map(|v| v.len())
        );
    }
2304
    /// Diagnostic trace of the Huffman stage in isolation for a 25KB input:
    /// symbol statistics, per-symbol code coverage, and estimated vs. actual
    /// encoded size.
    #[test]
    fn test_debug_huffman_encoding() {
        use crate::huffman::HuffmanEncoder;

        let size = 25600;
        let data = generate_test_data(size);

        // Byte-frequency histogram.
        let mut freq = [0u64; 256];
        for &b in &data {
            freq[b as usize] += 1;
        }
        let unique_count = freq.iter().filter(|&&f| f > 0).count();
        println!(
            "Input: {} bytes, {} unique symbols",
            data.len(),
            unique_count
        );

        // Show the fifteen most frequent symbols.
        let mut freqs: Vec<_> = freq
            .iter()
            .enumerate()
            .filter(|&(_, &f)| f > 0)
            .map(|(i, &f)| (i as u8, f))
            .collect();
        freqs.sort_by(|a, b| b.1.cmp(&a.1));
        println!(
            "Symbol frequencies (top 15): {:?}",
            freqs
                .iter()
                .take(15)
                .map(|(b, f)| ((*b as char), *f))
                .collect::<Vec<_>>()
        );

        if let Some(encoder) = HuffmanEncoder::build(&data) {
            println!(
                "Huffman encoder built: max_bits={}, num_symbols={}",
                encoder.max_bits(),
                encoder.num_symbols()
            );

            // Every symbol that occurs in the input must have a code; a
            // missing code would corrupt the encoded stream.
            let codes = encoder.get_codes();
            let mut symbols_with_codes = 0;
            let mut symbols_without_codes = 0;

            for (i, code) in codes.iter().enumerate() {
                if freq[i] > 0 {
                    if code.num_bits > 0 {
                        symbols_with_codes += 1;
                    } else {
                        symbols_without_codes += 1;
                        println!("WARNING: Symbol {} (freq={}) has no code!", i, freq[i]);
                    }
                }
            }
            println!(
                "Symbols with codes: {}, without codes: {}",
                symbols_with_codes, symbols_without_codes
            );

            // Actual output size = encoded payload + serialized weight table.
            let compressed = encoder.encode(&data);
            let weights = encoder.serialize_weights();
            println!(
                "Huffman output: {} bytes data + {} bytes weights = {} total",
                compressed.len(),
                weights.len(),
                compressed.len() + weights.len()
            );

            // Compare the estimator against the real total.
            let estimated = encoder.estimate_size(&data);
            println!(
                "Estimated: {} bytes, actual: {} bytes",
                estimated,
                compressed.len() + weights.len()
            );
        } else {
            println!("Huffman encoder build failed!");
        }
    }
2391
2392 #[test]
2394 fn test_debug_match_finder() {
2395 use crate::compress::MatchFinder;
2396
2397 let size = 25600;
2398 let data = generate_test_data(size);
2399
2400 println!("Input size: {} bytes", data.len());
2401 println!(
2402 "Pattern: first 45 bytes = {:?}",
2403 String::from_utf8_lossy(&data[..45])
2404 );
2405
2406 let mut mf = MatchFinder::new(16);
2407 let matches = mf.find_matches(&data);
2408
2409 println!("Total matches found: {}", matches.len());
2410
2411 for (i, m) in matches.iter().take(10).enumerate() {
2413 println!(
2414 "Match {}: pos={}, offset={}, length={}",
2415 i, m.position, m.offset, m.length
2416 );
2417 }
2418
2419 let total_match_len: usize = matches.iter().map(|m| m.length).sum();
2421 println!(
2422 "Total match coverage: {} bytes ({:.1}% of input)",
2423 total_match_len,
2424 100.0 * total_match_len as f64 / data.len() as f64
2425 );
2426
2427 if matches.len() == 1 {
2429 let m = &matches[0];
2430 println!("\nSingle match analysis:");
2431 println!(
2432 " Position {} to {} (length {})",
2433 m.position,
2434 m.position + m.length,
2435 m.length
2436 );
2437 println!(" References data at offset {} back", m.offset);
2438 println!(
2439 " Expected decompressed output: literals[0..{}] + match copy",
2440 m.position
2441 );
2442 }
2443 }
2444
    /// Hand-decodes the literals-section header of the first block emitted by
    /// our compressor, covering all four Size_Format layouts for compressed
    /// literals, then hex-dumps the block prefix. Purely diagnostic.
    #[test]
    fn test_debug_block_encoding() {
        let size = 25600;
        let data = generate_test_data(size);

        let compressor = ZstdCompressor::new();
        let full_compressed = compressor.compress(&data).unwrap();
        println!("Full frame: {} bytes", full_compressed.len());

        // NOTE(review): 8 hard-codes the frame-header width (magic + FHD +
        // window/FCS bytes) for this particular input — confirm if the
        // compressor's header layout ever changes.
        let block_start = 8; let block_header = u32::from_le_bytes([
            full_compressed[block_start],
            full_compressed[block_start + 1],
            full_compressed[block_start + 2],
            0,
        ]);
        let is_last = (block_header & 1) != 0;
        let btype = (block_header >> 1) & 3;
        let block_size = (block_header >> 3) as usize;
        println!(
            "Block header: type={}, size={}, is_last={}",
            btype, block_size, is_last
        );

        // Only compressed blocks (type 2) carry a literals-section header.
        if btype == 2 {
            let lit_header = full_compressed[block_start + 3];
            let lit_type = lit_header & 0x03;
            let lit_size_format = (lit_header >> 2) & 0x03;
            println!(
                "Literals section: type={}, size_format={}",
                lit_type, lit_size_format
            );

            // For compressed literals (type 2), the regenerated and
            // compressed sizes are bit-packed across 3-5 header bytes
            // depending on Size_Format.
            match (lit_type, lit_size_format) {
                (2, 0) => {
                    let b0 = full_compressed[block_start + 3];
                    let b1 = full_compressed[block_start + 4];
                    let b2 = full_compressed[block_start + 5];
                    let regen = ((b0 as u32 >> 4) & 0xF) | (((b1 as u32) & 0x3F) << 4);
                    let comp = ((b1 as u32 >> 6) & 0x3) | ((b2 as u32) << 2);
                    println!("Size_Format=0: regen={}, comp={}", regen, comp);
                }
                (2, 1) => {
                    let b0 = full_compressed[block_start + 3];
                    let b1 = full_compressed[block_start + 4];
                    let b2 = full_compressed[block_start + 5];
                    let b3 = full_compressed[block_start + 6];
                    let regen =
                        ((b0 as u32 >> 4) & 0xF) | ((b1 as u32) << 4) | (((b2 as u32) & 0x3) << 12);
                    let comp = ((b2 as u32 >> 2) & 0x3F) | ((b3 as u32) << 6);
                    println!("Size_Format=1: regen={}, comp={}", regen, comp);
                }
                (2, 2) => {
                    let b0 = full_compressed[block_start + 3];
                    let b1 = full_compressed[block_start + 4];
                    let b2 = full_compressed[block_start + 5];
                    let b3 = full_compressed[block_start + 6];
                    let b4 = full_compressed[block_start + 7];
                    let regen = ((b0 as u32 >> 4) & 0xF)
                        | ((b1 as u32) << 4)
                        | (((b2 as u32) & 0x3F) << 12);
                    let comp = ((b2 as u32 >> 6) & 0x3) | ((b3 as u32) << 2) | ((b4 as u32) << 10);
                    println!("Size_Format=2: regen={}, comp={}", regen, comp);
                }
                (2, 3) => {
                    let b0 = full_compressed[block_start + 3];
                    let b1 = full_compressed[block_start + 4];
                    let b2 = full_compressed[block_start + 5];
                    let regen = ((b0 as u32 >> 4) & 0xF) | (((b1 as u32) & 0x3F) << 4);
                    let comp = ((b1 as u32 >> 6) & 0x3) | ((b2 as u32) << 2);
                    println!(
                        "Size_Format=3 (single stream): regen={}, comp={}",
                        regen, comp
                    );
                }
                _ => {}
            }
        }

        // Hex dump of the block payload; the `- 4` stops short of what is
        // presumably a trailing 4-byte checksum — TODO confirm.
        println!("\nBlock data (first 60 bytes):");
        let block_data_start = block_start + 3;
        let block_end = (block_data_start + block_size).min(full_compressed.len() - 4);
        for (i, chunk) in full_compressed[block_data_start..block_end]
            .chunks(20)
            .enumerate()
        {
            println!(" {:04x}: {:02x?}", i * 20, chunk);
        }
    }
2544
    /// Diagnostic dump of our FSE sequence encoding for two hand-built
    /// sequences, plus the predefined LL/OF/ML decode-table parameters, for
    /// manual comparison against the reference encoder's bitstream.
    #[test]
    fn test_fse_bitstream_comparison() {
        use crate::block::Sequence;
        use crate::compress::encode_sequences_fse;
        use crate::fse::{
            FseTable, LITERAL_LENGTH_ACCURACY_LOG, LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
        };
        use crate::fse::{MATCH_LENGTH_ACCURACY_LOG, MATCH_LENGTH_DEFAULT_DISTRIBUTION};
        use crate::fse::{OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION};

        // Two small, fixed sequences keep the output easy to eyeball.
        let sequences = vec![
            Sequence {
                literal_length: 5,
                match_length: 10,
                offset: 100,
            },
            Sequence {
                literal_length: 3,
                match_length: 8,
                offset: 50,
            },
        ];

        println!("=== FSE Bitstream Comparison Test ===");
        println!("Sequences: {:?}", sequences);

        let mut our_output = Vec::new();
        let result = encode_sequences_fse(&sequences, &mut our_output);

        match result {
            Ok(()) => {
                println!("\nOur FSE encoding succeeded: {} bytes", our_output.len());
                println!("Output bytes: {:02x?}", our_output);

                // Byte 0: sequence count. Byte 1: compression-modes byte,
                // holding 2-bit mode fields for LL, OF, and ML.
                if !our_output.is_empty() {
                    let seq_count = our_output[0];
                    println!("Sequence count byte: {}", seq_count);
                    if our_output.len() > 1 {
                        let mode_byte = our_output[1];
                        println!(
                            "Mode byte: 0x{:02x} (LL={}, OF={}, ML={})",
                            mode_byte,
                            (mode_byte >> 6) & 0x3,
                            (mode_byte >> 4) & 0x3,
                            (mode_byte >> 2) & 0x3
                        );
                    }

                    // Remaining bytes: the FSE bitstream, 16 bytes per row.
                    if our_output.len() > 2 {
                        println!("\nBitstream ({} bytes):", our_output.len() - 2);
                        for (i, b) in our_output[2..].iter().enumerate() {
                            print!("{:02x} ", b);
                            if (i + 1) % 16 == 0 {
                                println!();
                            }
                        }
                        println!();
                    }
                }
            }
            Err(e) => {
                println!("Our FSE encoding failed: {:?}", e);
            }
        }

        // Print the shapes of the predefined decode tables for reference.
        println!("\n=== Decode Table Info ===");
        let ll_table = FseTable::from_predefined(
            &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
            LITERAL_LENGTH_ACCURACY_LOG,
        )
        .unwrap();
        let of_table =
            FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
        let ml_table = FseTable::from_predefined(
            &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
            MATCH_LENGTH_ACCURACY_LOG,
        )
        .unwrap();

        println!(
            "LL table: accuracy_log={}, size={}",
            ll_table.accuracy_log(),
            ll_table.size()
        );
        println!(
            "OF table: accuracy_log={}, size={}",
            of_table.accuracy_log(),
            of_table.size()
        );
        println!(
            "ML table: accuracy_log={}, size={}",
            ml_table.accuracy_log(),
            ml_table.size()
        );
    }
2649
2650 #[test]
2652 fn test_analyze_reference_sequence_bitstream() {
2653 let mut data = Vec::new();
2656 for i in 0..50u8 {
2657 data.push(i + 0x30); }
2659 for i in 0..20u8 {
2661 data.push(i + 0x30);
2662 }
2663 let data = &data[..];
2664
2665 println!("=== Analyze Reference Sequence Bitstream ===");
2666 println!(
2667 "Input: {:?} ({} bytes)",
2668 String::from_utf8_lossy(data),
2669 data.len()
2670 );
2671
2672 let compressed = zstd::encode_all(&data[..], 3).expect("compress failed");
2673 println!(
2674 "\nReference compressed ({} bytes): {:02x?}",
2675 compressed.len(),
2676 compressed
2677 );
2678
2679 if compressed.len() >= 4 {
2681 let magic =
2682 u32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]);
2683 println!("Magic: 0x{:08x}", magic);
2684 }
2685
2686 if compressed.len() > 4 {
2688 let fhd = compressed[4];
2689 let single_segment = (fhd >> 5) & 0x1 != 0;
2690 let fcs_field = (fhd >> 6) & 0x3;
2691 let fcs_size = match fcs_field {
2692 0 => {
2693 if single_segment {
2694 1
2695 } else {
2696 0
2697 }
2698 }
2699 1 => 2,
2700 2 => 4,
2701 3 => 8,
2702 _ => 0,
2703 };
2704 let window_size = if single_segment { 0 } else { 1 };
2705 let header_end = 5 + window_size + fcs_size;
2706
2707 println!(
2708 "FHD: 0x{:02x}, single_segment={}, fcs_size={}",
2709 fhd, single_segment, fcs_size
2710 );
2711 println!("Header ends at: {}", header_end);
2712
2713 if compressed.len() > header_end + 3 {
2714 let bh = u32::from_le_bytes([
2716 compressed[header_end],
2717 compressed[header_end + 1],
2718 compressed[header_end + 2],
2719 0,
2720 ]);
2721 let last = bh & 1 != 0;
2722 let block_type = (bh >> 1) & 3;
2723 let block_size = (bh >> 3) as usize;
2724
2725 println!("\nBlock at {}:", header_end);
2726 println!(
2727 " Last: {}, Type: {} ({}), Size: {}",
2728 last,
2729 block_type,
2730 match block_type {
2731 0 => "Raw",
2732 1 => "RLE",
2733 2 => "Compressed",
2734 _ => "?",
2735 },
2736 block_size
2737 );
2738
2739 if block_type == 2 && compressed.len() >= header_end + 3 + block_size {
2740 let block_start = header_end + 3;
2741 let block_data = &compressed[block_start..block_start + block_size];
2742 println!(
2743 "\nBlock content ({} bytes): {:02x?}",
2744 block_data.len(),
2745 block_data
2746 );
2747
2748 if !block_data.is_empty() {
2750 let lit_type = block_data[0] & 0x3;
2751 let lit_size_format = (block_data[0] >> 2) & 0x3;
2752 println!(
2753 "\nLiterals type: {} ({})",
2754 lit_type,
2755 match lit_type {
2756 0 => "Raw",
2757 1 => "RLE",
2758 2 => "Compressed",
2759 3 => "Treeless",
2760 _ => "?",
2761 }
2762 );
2763
2764 let (lit_regen_size, lit_header_size) = if lit_type == 0 || lit_type == 1 {
2765 match lit_size_format {
2767 0 | 2 => (((block_data[0] >> 3) & 0x1F) as usize, 1usize),
2768 1 => {
2769 let s = ((block_data[0] >> 4) as usize)
2770 | ((block_data[1] as usize) << 4);
2771 (s, 2)
2772 }
2773 3 => {
2774 let s = ((block_data[0] >> 4) as usize)
2775 | ((block_data[1] as usize) << 4)
2776 | (((block_data[2] & 0x3F) as usize) << 12);
2777 (s, 3)
2778 }
2779 _ => (0, 1),
2780 }
2781 } else {
2782 (0, 0)
2784 };
2785
2786 println!(
2787 "Literals regenerated size: {}, header size: {}",
2788 lit_regen_size, lit_header_size
2789 );
2790
2791 let seq_start = lit_header_size
2793 + if lit_type == 0 {
2794 lit_regen_size
2795 } else {
2796 if lit_type == 1 {
2797 1
2798 } else {
2799 0
2800 }
2801 };
2802 if seq_start < block_data.len() {
2803 println!("\nSequence section at offset {}:", seq_start);
2804 let seq_data = &block_data[seq_start..];
2805 println!(" Sequence data: {:02x?}", seq_data);
2806
2807 if !seq_data.is_empty() {
2808 let seq_count = seq_data[0];
2809 println!(
2810 " Sequence count byte: {} (count = {})",
2811 seq_data[0],
2812 if seq_count < 128 {
2813 seq_count as usize
2814 } else {
2815 ((seq_count as usize - 128) << 8) | seq_data[1] as usize
2816 }
2817 );
2818
2819 let (count, header_len) = if seq_count < 128 {
2820 (seq_count as usize, 1)
2821 } else if seq_count < 255 {
2822 (((seq_count as usize - 128) << 8) | seq_data[1] as usize, 2)
2823 } else {
2824 (
2825 seq_data[1] as usize
2826 | ((seq_data[2] as usize) << 8) + 0x7F00,
2827 3,
2828 )
2829 };
2830
2831 if seq_data.len() > header_len {
2832 let mode_byte = seq_data[header_len];
2833 println!(
2834 " Mode byte: 0x{:02x} (LL={}, OF={}, ML={})",
2835 mode_byte,
2836 (mode_byte >> 6) & 3,
2837 (mode_byte >> 4) & 3,
2838 (mode_byte >> 2) & 3
2839 );
2840 }
2841
2842 if seq_data.len() > header_len + 1 {
2843 let bitstream = &seq_data[header_len + 1..];
2844 println!(
2845 " FSE Bitstream ({} bytes): {:02x?}",
2846 bitstream.len(),
2847 bitstream
2848 );
2849 }
2850 }
2851 }
2852 }
2853 }
2854 }
2855 }
2856
2857 let decompressed = zstd::decode_all(&compressed[..]).expect("decompress failed");
2859 assert_eq!(&decompressed, data);
2860 println!("\nRoundtrip verified!");
2861
2862 use crate::block::Sequence;
2864 use crate::compress::encode_sequences_fse;
2865
2866 let sequences = vec![Sequence {
2869 literal_length: 50,
2870 match_length: 20,
2871 offset: 53,
2872 }];
2873
2874 println!("\n=== Our Encoding ===");
2875 println!("Sequence: ll=50, ml=20, offset_value=53 (actual offset 50)");
2876
2877 let mut our_output = Vec::new();
2878 encode_sequences_fse(&sequences, &mut our_output).expect("encode failed");
2879
2880 println!(
2881 "Our sequence section ({} bytes): {:02x?}",
2882 our_output.len(),
2883 our_output
2884 );
2885 if our_output.len() >= 2 {
2886 println!(" Count: {}", our_output[0]);
2887 println!(" Mode: 0x{:02x}", our_output[1]);
2888 if our_output.len() > 2 {
2889 println!(" Bitstream: {:02x?}", &our_output[2..]);
2890 }
2891 }
2892
2893 let ref_bitstream = &[0x52, 0x69, 0x05, 0x05];
2895 let our_bitstream = if our_output.len() > 2 {
2896 &our_output[2..]
2897 } else {
2898 &[]
2899 };
2900
2901 println!("\n=== Comparison ===");
2902 println!("Reference: {:02x?}", ref_bitstream);
2903 println!("Ours: {:02x?}", our_bitstream);
2904
2905 if ref_bitstream == our_bitstream {
2906 println!("BITSTREAMS MATCH!");
2907 } else {
2908 println!("BITSTREAMS DIFFER!");
2909 decode_bitstream_bits("Reference", ref_bitstream);
2911 decode_bitstream_bits("Ours", our_bitstream);
2912 }
2913 }
2914
2915 #[test]
2918 fn test_reference_decodes_our_fse() {
2919 use haagenti_core::{Compressor, Decompressor};
2920
2921 let data: Vec<u8> = b"ABCD".iter().cycle().take(100).copied().collect();
2924
2925 println!("=== Test Reference Decodes Our FSE ===");
2926 println!("Input: {} bytes", data.len());
2927
2928 let mut mf = crate::compress::LazyMatchFinder::new(16);
2930 let matches = mf.find_matches(&data);
2931 println!("Matches found: {}", matches.len());
2932 for (i, m) in matches.iter().enumerate() {
2933 println!(
2934 " Match[{}]: pos={}, len={}, offset={}",
2935 i, m.position, m.length, m.offset
2936 );
2937 }
2938 let (literals, seqs) = crate::compress::block::matches_to_sequences(&data, &matches);
2939 println!("Sequences: {}", seqs.len());
2940 for (i, s) in seqs.iter().enumerate() {
2941 println!(
2942 " Seq[{}]: ll={}, offset={}, ml={}",
2943 i, s.literal_length, s.offset, s.match_length
2944 );
2945 let enc = crate::compress::EncodedSequence::from_sequence(s);
2946 println!(
2947 " Encoded: ll_code={}, of_code={}, ml_code={}",
2948 enc.ll_code, enc.of_code, enc.ml_code
2949 );
2950 println!(
2951 " Extra: ll_bits={}, of_extra={}, ml_extra={}",
2952 enc.ll_bits, enc.of_extra, enc.ml_extra
2953 );
2954 }
2955
2956 let compressor = ZstdCompressor::new();
2958 let compressed = compressor.compress(&data).expect("our compress failed");
2959 println!("Compressed: {} bytes", compressed.len());
2960 println!("Bytes: {:02x?}", compressed);
2961
2962 match zstd::decode_all(&compressed[..]) {
2964 Ok(decoded) => {
2965 println!("Reference zstd decoded: {} bytes", decoded.len());
2966 if decoded == data {
2967 println!("SUCCESS! Reference zstd correctly decoded our output!");
2968 } else {
2969 println!("MISMATCH! Decoded data differs from original");
2970 println!("Expected: {:?}", data);
2971 println!("Got: {:?}", decoded);
2972 }
2973 assert_eq!(decoded, data, "Reference decode mismatch");
2974 }
2975 Err(e) => {
2976 println!("FAILED: Reference zstd could not decode: {:?}", e);
2977
2978 if compressed.len() >= 4 {
2980 let magic = u32::from_le_bytes([
2981 compressed[0],
2982 compressed[1],
2983 compressed[2],
2984 compressed[3],
2985 ]);
2986 println!("Magic: 0x{:08x}", magic);
2987 }
2988 if compressed.len() > 4 {
2989 let fhd = compressed[4];
2990 println!("FHD: 0x{:02x}", fhd);
2991 }
2992
2993 let decompressor = ZstdDecompressor::new();
2995 match decompressor.decompress(&compressed) {
2996 Ok(decoded) => {
2997 println!("Our decoder succeeded: {} bytes", decoded.len());
2998 if decoded == data {
2999 println!("Our roundtrip works, issue is reference compatibility");
3000 }
3001 }
3002 Err(e2) => {
3003 println!("Our decoder also failed: {:?}", e2);
3004 }
3005 }
3006
3007 panic!("Reference zstd failed to decode our output");
3008 }
3009 }
3010 }
3011
    /// Diagnostic test: compress a 500-byte repeating input (long enough to
    /// emit more than one sequence) and require the reference `zstd` crate to
    /// decode our output back to the original bytes.
    #[test]
    fn test_two_sequences() {
        use haagenti_core::Compressor;

        // "ABCD" repeated; highly compressible, yields multiple sequences.
        let data: Vec<u8> = b"ABCD".iter().cycle().take(500).copied().collect();

        println!("=== Test Two Sequences ===");
        println!("Input: {} bytes", data.len());

        // Trace the match-finding stage so failures can be diagnosed by eye.
        let mut mf = crate::compress::LazyMatchFinder::new(16);
        let matches = mf.find_matches(&data);
        println!("Matches found: {}", matches.len());
        for (i, m) in matches.iter().enumerate() {
            println!(
                " Match[{}]: pos={}, len={}, offset={}",
                i, m.position, m.length, m.offset
            );
        }
        // Convert matches to (literals, sequences); literals are unused here.
        let (literals, seqs) = crate::compress::block::matches_to_sequences(&data, &matches);
        println!("Sequences: {}", seqs.len());
        for (i, s) in seqs.iter().enumerate() {
            println!(
                " Seq[{}]: ll={}, offset={}, ml={}",
                i, s.literal_length, s.offset, s.match_length
            );
            // Show the FSE code / extra-bit split for each sequence field.
            let enc = crate::compress::EncodedSequence::from_sequence(s);
            println!(
                " Encoded: ll_code={}, of_code={}, ml_code={}",
                enc.ll_code, enc.of_code, enc.ml_code
            );
            println!(
                " Extra: ll_extra={}({} bits), of_extra={}({} bits), ml_extra={}({} bits)",
                enc.ll_extra, enc.ll_bits, enc.of_extra, enc.of_bits, enc.ml_extra, enc.ml_bits
            );
        }

        let compressor = ZstdCompressor::new();
        let compressed = compressor.compress(&data).expect("our compress failed");
        println!("Compressed: {} bytes", compressed.len());
        println!("Bytes: {:02x?}", compressed);

        // Reference encoder output, printed for side-by-side comparison only.
        let ref_compressed = zstd::encode_all(&data[..], 1).expect("ref compress failed");
        println!("Reference compressed: {} bytes", ref_compressed.len());
        println!("Reference bytes: {:02x?}", ref_compressed);

        // Inspect where ML code 46 sits in the predefined match-length table;
        // useful when the decoder lands on an unexpected state.
        use crate::fse::{FseTable, MATCH_LENGTH_ACCURACY_LOG, MATCH_LENGTH_DEFAULT_DISTRIBUTION};
        let ml_table = FseTable::from_predefined(
            &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
            MATCH_LENGTH_ACCURACY_LOG,
        )
        .unwrap();
        println!("\nML code 46 positions in decode table:");
        for pos in 0..ml_table.size() {
            let entry = ml_table.decode(pos);
            if entry.symbol == 46 {
                println!(
                    " Position {}: symbol={}, nb_bits={}, baseline={}",
                    pos, entry.symbol, entry.num_bits, entry.baseline
                );
            }
        }
        // Two specific table positions observed during earlier debugging.
        let entry63 = ml_table.decode(63);
        let entry42 = ml_table.decode(42);
        println!("Position 63 decodes to: symbol={}", entry63.symbol);
        println!("Position 42 decodes to: symbol={}", entry42.symbol);

        // The actual assertion: reference zstd must round-trip our frame.
        match zstd::decode_all(&compressed[..]) {
            Ok(decoded) => {
                println!("Reference zstd decoded: {} bytes", decoded.len());
                if decoded == data {
                    println!("SUCCESS! Reference zstd correctly decoded our 2-sequence output!");
                } else {
                    println!("MISMATCH! Decoded data differs from original");
                }
                assert_eq!(decoded, data, "Reference decode mismatch");
            }
            Err(e) => {
                println!("FAILED: Reference zstd could not decode: {:?}", e);
                panic!("Reference zstd failed to decode our 2-sequence output");
            }
        }
    }
3105
3106 #[test]
3108 fn test_reference_decode_no_checksum() {
3109 use haagenti_core::{Compressor, Decompressor};
3110
3111 let mut data = Vec::new();
3113 for i in 0..100u8 {
3114 data.push(i);
3115 }
3116 for i in 0..50u8 {
3117 data.push(i);
3118 }
3119
3120 println!("=== Test Reference Decode Without Checksum ===");
3121 println!("Input: {} bytes", data.len());
3122
3123 let compressor = ZstdCompressor::new();
3124 let compressed = compressor.compress(&data).expect("compress failed");
3125 println!("Original compressed: {} bytes", compressed.len());
3126 println!("Full bytes: {:02x?}", compressed);
3127
3128 let fhd = compressed[4];
3130 println!("\nFHD byte: 0x{:02x}", fhd);
3131 println!(" Content_Checksum_flag: {}", (fhd >> 2) & 1);
3132 println!(" Single_Segment_flag: {}", (fhd >> 5) & 1);
3133
3134 let mut modified = compressed.clone();
3136
3137 modified[4] = fhd & !0x04;
3139 println!("\nModified FHD byte: 0x{:02x}", modified[4]);
3140
3141 modified.truncate(modified.len() - 4);
3143 println!("Modified compressed: {} bytes", modified.len());
3144 println!("Modified bytes: {:02x?}", modified);
3145
3146 match zstd::decode_all(&modified[..]) {
3148 Ok(decoded) => {
3149 println!(
3150 "SUCCESS! Reference decoded without checksum: {} bytes",
3151 decoded.len()
3152 );
3153 if decoded == data {
3154 println!("Data matches! Issue is CHECKSUM, not block encoding");
3155 } else {
3156 println!("Data mismatch! Both checksum AND block encoding have issues");
3157 println!("Expected first 20: {:?}", &data[..20]);
3158 println!("Got first 20: {:?}", &decoded[..20.min(decoded.len())]);
3159 }
3160 }
3161 Err(e) => {
3162 println!("FAILED even without checksum: {:?}", e);
3163 println!("Issue is in BLOCK ENCODING, not checksum");
3164
3165 let decompressor = ZstdDecompressor::new();
3167 match decompressor.decompress(&modified) {
3168 Ok(decoded) => {
3169 println!("Our decoder succeeded on modified: {} bytes", decoded.len());
3170 }
3171 Err(e2) => {
3172 println!("Our decoder also failed on modified: {:?}", e2);
3173 }
3174 }
3175 }
3176 }
3177 }
3178
3179 #[test]
3181 fn test_debug_fse_state_values() {
3182 use crate::block::Sequence;
3183 use crate::compress::EncodedSequence;
3184 use crate::fse::{
3185 FseBitWriter, FseTable, InterleavedTansEncoder, LITERAL_LENGTH_ACCURACY_LOG,
3186 LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
3187 MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
3188 };
3189
3190 println!("=== Debug FSE State Values ===");
3191
3192 let seq = Sequence::new(100, 103, 50);
3200 let encoded = EncodedSequence::from_sequence(&seq);
3201
3202 println!(
3203 "Sequence: ll={}, of={}, ml={}",
3204 seq.literal_length, seq.offset, seq.match_length
3205 );
3206 println!(
3207 "Encoded: ll_code={}, of_code={}, ml_code={}",
3208 encoded.ll_code, encoded.of_code, encoded.ml_code
3209 );
3210 println!(
3211 "Extra bits: ll={}({} bits), of={}({} bits), ml={}({} bits)",
3212 encoded.ll_extra,
3213 encoded.ll_bits,
3214 encoded.of_extra,
3215 encoded.of_code,
3216 encoded.ml_extra,
3217 encoded.ml_bits
3218 );
3219
3220 let ll_table = FseTable::from_predefined(
3222 &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
3223 LITERAL_LENGTH_ACCURACY_LOG,
3224 )
3225 .unwrap();
3226 let of_table =
3227 FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
3228 let ml_table = FseTable::from_predefined(
3229 &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
3230 MATCH_LENGTH_ACCURACY_LOG,
3231 )
3232 .unwrap();
3233
3234 println!(
3235 "\nTable sizes: LL={}, OF={}, ML={}",
3236 ll_table.size(),
3237 of_table.size(),
3238 ml_table.size()
3239 );
3240 println!(
3241 "Accuracy logs: LL={}, OF={}, ML={}",
3242 LITERAL_LENGTH_ACCURACY_LOG, OFFSET_ACCURACY_LOG, MATCH_LENGTH_ACCURACY_LOG
3243 );
3244
3245 let mut tans = InterleavedTansEncoder::new(&ll_table, &of_table, &ml_table);
3247
3248 tans.init_states(encoded.ll_code, encoded.of_code, encoded.ml_code);
3250 let (ll_state, of_state, ml_state) = tans.get_states();
3251
3252 println!(
3253 "\nAfter init_states({}, {}, {}):",
3254 encoded.ll_code, encoded.of_code, encoded.ml_code
3255 );
3256 println!(" LL state: {}", ll_state);
3257 println!(" OF state: {}", of_state);
3258 println!(" ML state: {}", ml_state);
3259
3260 let mut bits = FseBitWriter::new();
3262
3263 bits.write_bits(encoded.of_extra, encoded.of_code); bits.write_bits(encoded.ml_extra, encoded.ml_bits); bits.write_bits(encoded.ll_extra, encoded.ll_bits); let (ll_log, of_log, ml_log) = tans.accuracy_logs();
3270 bits.write_bits(ml_state, ml_log);
3271 bits.write_bits(of_state, of_log);
3272 bits.write_bits(ll_state, ll_log);
3273
3274 let bitstream = bits.finish();
3275 println!("\nOur bitstream: {:02x?}", bitstream);
3276
3277 println!("Reference bitstream: [e4, 67, 14, a2]");
3279
3280 let our_16 = u16::from_le_bytes([bitstream[0], bitstream[1]]);
3282 let ref_16 = u16::from_le_bytes([0xe4, 0x67]);
3283 println!(
3284 "\nFirst 16 bits (le): ours=0x{:04x} ref=0x{:04x}",
3285 our_16, ref_16
3286 );
3287 println!("Ours binary: {:016b}", our_16);
3288 println!("Ref binary: {:016b}", ref_16);
3289
3290 println!("\n=== Decode table positions ===");
3292 println!("LL code {} appears at positions:", encoded.ll_code);
3293 for pos in 0..ll_table.size() {
3294 let entry = ll_table.decode(pos);
3295 if entry.symbol == encoded.ll_code {
3296 println!(
3297 " Position {}: symbol={}, nb_bits={}, baseline={}",
3298 pos, entry.symbol, entry.num_bits, entry.baseline
3299 );
3300 }
3301 }
3302
3303 println!("OF code {} appears at positions:", encoded.of_code);
3304 for pos in 0..of_table.size() {
3305 let entry = of_table.decode(pos);
3306 if entry.symbol == encoded.of_code {
3307 println!(
3308 " Position {}: symbol={}, nb_bits={}, baseline={}",
3309 pos, entry.symbol, entry.num_bits, entry.baseline
3310 );
3311 }
3312 }
3313
3314 println!("ML code {} appears at positions:", encoded.ml_code);
3315 for pos in 0..ml_table.size() {
3316 let entry = ml_table.decode(pos);
3317 if entry.symbol == encoded.ml_code {
3318 println!(
3319 " Position {}: symbol={}, nb_bits={}, baseline={}",
3320 pos, entry.symbol, entry.num_bits, entry.baseline
3321 );
3322 }
3323 }
3324 }
3325
3326 #[test]
3328 fn test_compare_block_structure() {
3329 use haagenti_core::Compressor;
3330
3331 let mut data = Vec::new();
3333 for i in 0..100u8 {
3334 data.push(i);
3335 }
3336 for i in 0..50u8 {
3337 data.push(i);
3338 }
3339
3340 println!("=== Compare Block Structure ===");
3341 println!("Input: {} bytes", data.len());
3342
3343 let ref_compressed = zstd::encode_all(&data[..], 1).expect("ref compress failed");
3345 println!("\nReference compressed: {} bytes", ref_compressed.len());
3346 println!("Reference bytes: {:02x?}", ref_compressed);
3347
3348 let ref_fhd = ref_compressed[4];
3350 println!("\nReference FHD: 0x{:02x}", ref_fhd);
3351
3352 let compressor = ZstdCompressor::new();
3354 let our_compressed = compressor.compress(&data).expect("our compress failed");
3355 println!("\nOur compressed: {} bytes", our_compressed.len());
3356 println!("Our bytes: {:02x?}", our_compressed);
3357
3358 let our_fhd = our_compressed[4];
3360 println!("\nOur FHD: 0x{:02x}", our_fhd);
3361
3362 let ref_single_segment = (ref_fhd >> 5) & 1 == 1;
3368 let ref_has_checksum = (ref_fhd >> 2) & 1 == 1;
3369 let ref_fcs_size = match ref_fhd >> 6 {
3370 0 if ref_single_segment => 1,
3371 0 => 0,
3372 1 => 2,
3373 2 => 4,
3374 3 => 8,
3375 _ => 0,
3376 };
3377 let ref_window_present = !ref_single_segment;
3378 let ref_header_size = 1 + (if ref_window_present { 1 } else { 0 }) + ref_fcs_size;
3379 println!("\nReference frame header size: {} bytes", ref_header_size);
3380 println!(" Single segment: {}", ref_single_segment);
3381 println!(" Has checksum: {}", ref_has_checksum);
3382
3383 let our_single_segment = (our_fhd >> 5) & 1 == 1;
3385 let our_has_checksum = (our_fhd >> 2) & 1 == 1;
3386 let our_fcs_size = match our_fhd >> 6 {
3387 0 if our_single_segment => 1,
3388 0 => 0,
3389 1 => 2,
3390 2 => 4,
3391 3 => 8,
3392 _ => 0,
3393 };
3394 let our_window_present = !our_single_segment;
3395 let our_header_size = 1 + (if our_window_present { 1 } else { 0 }) + our_fcs_size;
3396 println!("\nOur frame header size: {} bytes", our_header_size);
3397 println!(" Single segment: {}", our_single_segment);
3398 println!(" Has checksum: {}", our_has_checksum);
3399
3400 let ref_block_start = 4 + ref_header_size;
3402 let our_block_start = 4 + our_header_size;
3403
3404 println!(
3405 "\nReference block header at offset {}: {:02x?}",
3406 ref_block_start,
3407 &ref_compressed[ref_block_start..ref_block_start + 3]
3408 );
3409 println!(
3410 "Our block header at offset {}: {:02x?}",
3411 our_block_start,
3412 &our_compressed[our_block_start..our_block_start + 3]
3413 );
3414
3415 let ref_block_header = u32::from_le_bytes([
3417 ref_compressed[ref_block_start],
3418 ref_compressed[ref_block_start + 1],
3419 ref_compressed[ref_block_start + 2],
3420 0,
3421 ]);
3422 let ref_is_last = ref_block_header & 1 == 1;
3423 let ref_block_type = (ref_block_header >> 1) & 3;
3424 let ref_block_size = ref_block_header >> 3;
3425
3426 let our_block_header = u32::from_le_bytes([
3427 our_compressed[our_block_start],
3428 our_compressed[our_block_start + 1],
3429 our_compressed[our_block_start + 2],
3430 0,
3431 ]);
3432 let our_is_last = our_block_header & 1 == 1;
3433 let our_block_type = (our_block_header >> 1) & 3;
3434 let our_block_size = our_block_header >> 3;
3435
3436 println!(
3437 "\nReference block: is_last={}, type={}, size={}",
3438 ref_is_last, ref_block_type, ref_block_size
3439 );
3440 println!(
3441 "Our block: is_last={}, type={}, size={}",
3442 our_is_last, our_block_type, our_block_size
3443 );
3444
3445 let ref_block_content_start = ref_block_start + 3;
3447 let our_block_content_start = our_block_start + 3;
3448
3449 println!("\n=== Literals Section ===");
3451 let ref_lit_header = ref_compressed[ref_block_content_start];
3452 let our_lit_header = our_compressed[our_block_content_start];
3453 println!("Reference literals header: 0x{:02x}", ref_lit_header);
3454 println!("Our literals header: 0x{:02x}", our_lit_header);
3455
3456 let ref_lit_type = ref_lit_header & 3;
3457 let our_lit_type = our_lit_header & 3;
3458 println!(
3459 "Reference literals type: {} (0=Raw, 1=RLE, 2=Compressed, 3=Treeless)",
3460 ref_lit_type
3461 );
3462 println!(
3463 "Our literals type: {} (0=Raw, 1=RLE, 2=Compressed, 3=Treeless)",
3464 our_lit_type
3465 );
3466
3467 let ref_remaining = &ref_compressed[ref_block_content_start..];
3470 let our_remaining = &our_compressed[our_block_content_start..];
3471
3472 let ref_block_end = ref_block_content_start + ref_block_size as usize;
3474 let our_block_end = our_block_content_start + our_block_size as usize;
3475
3476 if ref_block_end <= ref_compressed.len() {
3477 println!(
3478 "\nReference block last 15 bytes: {:02x?}",
3479 &ref_compressed[ref_block_end.saturating_sub(15)..ref_block_end]
3480 );
3481 }
3482 if our_block_end <= our_compressed.len() {
3483 println!(
3484 "Our block last 15 bytes: {:02x?}",
3485 &our_compressed[our_block_end.saturating_sub(15)..our_block_end]
3486 );
3487 }
3488 }
3489
3490 #[test]
3492 fn test_xxhash64_against_known_values() {
3493 use crate::frame::xxhash64;
3494
3495 println!("=== XXHash64 Verification ===");
3496
3497 let empty_hash = xxhash64(&[], 0);
3500 println!("xxhash64('', 0) = 0x{:016x}", empty_hash);
3501 let expected_empty = 0xEF46DB3751D8E999u64;
3502 println!("Expected: 0x{:016x}", expected_empty);
3503 if empty_hash == expected_empty {
3504 println!(" ✓ MATCH");
3505 } else {
3506 println!(" ✗ MISMATCH");
3507 }
3508
3509 let hello_hash = xxhash64(b"Hello", 0);
3512 println!("\nxxhash64('Hello', 0) = 0x{:016x}", hello_hash);
3513
3514 let digits_hash = xxhash64(b"0123456789", 0);
3516 println!("xxhash64('0123456789', 0) = 0x{:016x}", digits_hash);
3517
3518 let mut test_data = Vec::new();
3524 for i in 0..100u8 {
3525 test_data.push(i);
3526 }
3527 for i in 0..50u8 {
3528 test_data.push(i);
3529 }
3530
3531 let our_hash = xxhash64(&test_data, 0);
3532 let our_checksum = (our_hash & 0xFFFFFFFF) as u32;
3533 println!("\nFor 150-byte test data:");
3534 println!(" Our full xxhash64: 0x{:016x}", our_hash);
3535 println!(" Our 32-bit checksum: 0x{:08x}", our_checksum);
3536
3537 let ref_compressed = zstd::encode_all(&test_data[..], 1).expect("ref compress failed");
3539 println!("\nReference compressed: {} bytes", ref_compressed.len());
3540
3541 let ref_fhd = ref_compressed[4];
3543 println!("Reference FHD: 0x{:02x}", ref_fhd);
3544 let has_checksum = (ref_fhd >> 2) & 1 == 1;
3545 println!("Reference has checksum: {}", has_checksum);
3546
3547 if has_checksum {
3548 let ref_checksum = u32::from_le_bytes([
3550 ref_compressed[ref_compressed.len() - 4],
3551 ref_compressed[ref_compressed.len() - 3],
3552 ref_compressed[ref_compressed.len() - 2],
3553 ref_compressed[ref_compressed.len() - 1],
3554 ]);
3555 println!("Reference 32-bit checksum: 0x{:08x}", ref_checksum);
3556
3557 if our_checksum == ref_checksum {
3558 println!(" ✓ CHECKSUMS MATCH!");
3559 } else {
3560 println!(" ✗ CHECKSUMS DIFFER!");
3561 }
3562 }
3563 }
3564
    /// Diagnostic: dump the predefined offset-code FSE decode table and check
    /// which states `init_state(5)` / `init_states(23, 5, 17)` land on, for
    /// both a standalone encoder and the interleaved (sequence) encoder.
    #[test]
    fn test_debug_of_init_state() {
        use crate::fse::TansEncoder;
        use crate::fse::{
            FseTable, InterleavedTansEncoder, LITERAL_LENGTH_ACCURACY_LOG,
            LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
            MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
        };

        let of_table =
            FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();

        println!("=== Debug OF Init State for Code 5 ===");
        println!("OF accuracy log: {}", OFFSET_ACCURACY_LOG);
        println!("OF table size: {}", of_table.size());

        // First: every table position that decodes to offset code 5.
        println!("\nOF Decode Table:");
        println!(" Positions where symbol 5 appears:");
        for pos in 0..of_table.size() {
            let entry = of_table.decode(pos);
            if entry.symbol == 5 {
                println!(
                    " Position {} -> symbol={}, nb_bits={}, baseline={}",
                    pos, entry.symbol, entry.num_bits, entry.baseline
                );
            }
        }

        // Then the whole table, for eyeballing the spread of symbols.
        println!("\n All positions:");
        for pos in 0..of_table.size() {
            let entry = of_table.decode(pos);
            println!(
                " {:2}: symbol={:2}, nb_bits={}, baseline={:2}",
                pos, entry.symbol, entry.num_bits, entry.baseline
            );
        }

        // A standalone encoder initialized on symbol 5.
        let mut encoder = TansEncoder::from_decode_table(&of_table);
        encoder.init_state(5);
        let single_output_state = encoder.get_state();
        println!("\nSingle OF encoder:");
        println!(" init_state(5) -> output state = {}", single_output_state);

        // The interleaved encoder used by real sequence encoding should land
        // on the same OF state for the same symbol.
        let ll_table = FseTable::from_predefined(
            &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
            LITERAL_LENGTH_ACCURACY_LOG,
        )
        .unwrap();
        let ml_table = FseTable::from_predefined(
            &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
            MATCH_LENGTH_ACCURACY_LOG,
        )
        .unwrap();

        let mut interleaved = InterleavedTansEncoder::new(&ll_table, &of_table, &ml_table);

        interleaved.init_states(23, 5, 17);
        let (ll_state, of_state, ml_state) = interleaved.get_states();

        println!("\nInterleaved encoder (like sequence encoding):");
        println!(" init_states(23, 5, 17) -> states:");
        println!(" LL = {}", ll_state);
        println!(" OF = {}", of_state);
        println!(" ML = {}", ml_state);
        // Expected values observed from the reference decode tables.
        println!(" Expected OF = 18 (position 18 in decode table)");
        println!(" Expected LL = 38 (position 38 in decode table)");

        let entry18 = of_table.decode(18);
        println!(
            "\n Position 18 has: symbol={}, nb_bits={}, baseline={}",
            entry18.symbol, entry18.num_bits, entry18.baseline
        );
    }
3646
3647 fn decode_bitstream_bits(name: &str, bytes: &[u8]) {
3648 if bytes.is_empty() {
3649 println!(" {} is empty", name);
3650 return;
3651 }
3652
3653 println!(" {} bits:", name);
3654
3655 let last = bytes[bytes.len() - 1];
3657 let sentinel_pos = 31 - (last as u32).leading_zeros();
3658 println!(
3659 " Last byte: 0x{:02x}, sentinel at bit {}",
3660 last, sentinel_pos
3661 );
3662
3663 let total_bits = (bytes.len() - 1) * 8 + sentinel_pos as usize;
3665 println!(" Total data bits: {}", total_bits);
3666
3667 let mut bit_pos = 0;
3672 let mut bit_buffer: u64 = 0;
3673 let mut bits_in_buffer = 0;
3674
3675 for &b in bytes.iter().rev() {
3677 bit_buffer |= (b as u64) << bits_in_buffer;
3678 bits_in_buffer += 8;
3679 }
3680
3681 bits_in_buffer = total_bits;
3683 bit_buffer &= (1u64 << bits_in_buffer) - 1;
3684
3685 let ll_state = (bit_buffer >> (bits_in_buffer - 6)) & 0x3F;
3687 let of_state = (bit_buffer >> (bits_in_buffer - 6 - 5)) & 0x1F;
3688 let ml_state = (bit_buffer >> (bits_in_buffer - 6 - 5 - 6)) & 0x3F;
3689
3690 println!(
3691 " Initial states: LL={} OF={} ML={}",
3692 ll_state, of_state, ml_state
3693 );
3694
3695 let remaining = bits_in_buffer - 17;
3697 println!(" Remaining bits after states: {}", remaining);
3698 }
3699
3700 #[test]
3704 fn test_reference_zstd_comparison() {
3705 use haagenti_core::{Compressor, Decompressor};
3706
3707 let mut data = Vec::new();
3710
3711 for i in 0..100u8 {
3713 data.push(i);
3714 }
3715
3716 for i in 0..50u8 {
3718 data.push(i); }
3720
3721 data.push(0xAA);
3723 data.push(0xBB);
3724 data.push(0xCC);
3725
3726 for i in 50..80u8 {
3728 data.push(i); }
3730
3731 println!("=== Reference Zstd Comparison ===");
3732 println!(
3733 "Input data ({} bytes): {:?}",
3734 data.len(),
3735 String::from_utf8_lossy(&data)
3736 );
3737
3738 let ref_compressed =
3740 zstd::encode_all(&data[..], 3).expect("reference zstd compress failed");
3741 println!(
3742 "\nReference zstd compressed: {} bytes",
3743 ref_compressed.len()
3744 );
3745 println!("Reference bytes: {:02x?}", ref_compressed);
3746
3747 parse_zstd_frame("Reference", &ref_compressed);
3749
3750 let compressor = ZstdCompressor::new();
3752 let our_compressed = compressor.compress(&data).expect("our compress failed");
3753 println!(
3754 "\nOur implementation compressed: {} bytes",
3755 our_compressed.len()
3756 );
3757 println!("Our bytes: {:02x?}", our_compressed);
3758
3759 parse_zstd_frame("Ours", &our_compressed);
3761
3762 let ref_decompressed =
3764 zstd::decode_all(&ref_compressed[..]).expect("reference decode failed");
3765 assert_eq!(&ref_decompressed, &data, "Reference roundtrip failed");
3766
3767 println!("\n=== Decoding Tests ===");
3769 match zstd::decode_all(&our_compressed[..]) {
3770 Ok(decoded) => {
3771 println!("Reference zstd decoded our output: {} bytes", decoded.len());
3772 if decoded == data {
3773 println!("Reference zstd roundtrip SUCCEEDED!");
3774 } else {
3775 println!("Reference zstd decoded WRONG data!");
3776 println!("Expected {} bytes, got {} bytes", data.len(), decoded.len());
3777 }
3778 }
3779 Err(e) => {
3780 println!("Reference zstd FAILED to decode our output: {:?}", e);
3781 }
3782 }
3783
3784 let decompressor = ZstdDecompressor::new();
3786 match decompressor.decompress(&our_compressed) {
3787 Ok(decoded) => {
3788 println!("Our decoder succeeded: {} bytes", decoded.len());
3789 assert_eq!(&decoded, &data, "Our roundtrip failed");
3790 }
3791 Err(e) => {
3792 println!("Our decoder FAILED: {:?}", e);
3793 }
3794 }
3795
3796 println!("\n=== Done ===");
3797 }
3798
3799 fn parse_zstd_frame(name: &str, data: &[u8]) {
3801 println!("\n--- {} Frame Structure ---", name);
3802
3803 if data.len() < 4 {
3804 println!("Frame too short!");
3805 return;
3806 }
3807
3808 let magic = u32::from_le_bytes([data[0], data[1], data[2], data[3]]);
3810 println!("Magic: 0x{:08x} (expected: 0xFD2FB528)", magic);
3811
3812 if data.len() < 5 {
3813 return;
3814 }
3815
3816 let fhd = data[4];
3818 let fcs_size = match (fhd >> 6) & 0x3 {
3819 0 => {
3820 if fhd & 0x20 != 0 {
3821 1
3822 } else {
3823 0
3824 }
3825 }
3826 1 => 2,
3827 2 => 4,
3828 3 => 8,
3829 _ => 0,
3830 };
3831 let single_segment = (fhd >> 5) & 0x1 != 0;
3832 let content_checksum = (fhd >> 2) & 0x1 != 0;
3833 let dict_id_size = match fhd & 0x3 {
3834 0 => 0,
3835 1 => 1,
3836 2 => 2,
3837 3 => 4,
3838 _ => 0,
3839 };
3840
3841 println!("Frame Header Descriptor: 0x{:02x}", fhd);
3842 println!(" - FCS size: {} bytes", fcs_size);
3843 println!(" - Single segment: {}", single_segment);
3844 println!(" - Content checksum: {}", content_checksum);
3845 println!(" - Dict ID size: {} bytes", dict_id_size);
3846
3847 let window_desc_offset = if single_segment { 0 } else { 1 };
3848 let header_size = 5 + window_desc_offset + dict_id_size + fcs_size;
3849
3850 println!("Header ends at byte {}", header_size);
3851
3852 if data.len() > header_size {
3853 let block_start = header_size;
3855 if block_start + 3 <= data.len() {
3856 let bh0 = data[block_start] as u32;
3857 let bh1 = data[block_start + 1] as u32;
3858 let bh2 = data[block_start + 2] as u32;
3859 let block_header = bh0 | (bh1 << 8) | (bh2 << 16);
3860
3861 let last_block = block_header & 0x1 != 0;
3862 let block_type = (block_header >> 1) & 0x3;
3863 let block_size = (block_header >> 3) as usize;
3864
3865 println!("\nFirst Block at offset {}:", block_start);
3866 println!(
3867 " - Block header bytes: {:02x} {:02x} {:02x}",
3868 bh0, bh1, bh2
3869 );
3870 println!(" - Last block: {}", last_block);
3871 println!(
3872 " - Block type: {} ({})",
3873 block_type,
3874 match block_type {
3875 0 => "Raw",
3876 1 => "RLE",
3877 2 => "Compressed",
3878 3 => "Reserved",
3879 _ => "Unknown",
3880 }
3881 );
3882 println!(" - Block size: {} bytes", block_size);
3883
3884 let block_content_start = block_start + 3;
3886 let block_content_end = (block_content_start + block_size).min(data.len());
3887 println!(
3888 "\nBlock content ({} bytes):",
3889 block_content_end - block_content_start
3890 );
3891 for (i, chunk) in data[block_content_start..block_content_end]
3892 .chunks(16)
3893 .enumerate()
3894 {
3895 print!(" {:04x}: ", i * 16);
3896 for b in chunk {
3897 print!("{:02x} ", b);
3898 }
3899 println!();
3900 }
3901 }
3902 }
3903 }
3904
3905 #[test]
3908 fn test_fse_bytes_in_reference_frame() {
3909 let ref_frame: Vec<u8> = vec![
3915 0x28, 0xb5, 0x2f, 0xfd, 0x00, 0x48, 0x55, 0x00, 0x00, 0x20, 0x41, 0x42, 0x43, 0x44, 0x01, 0x00, 0xfd, 0xe4, 0x88, ];
3925
3926 println!("=== Test FSE Bytes in Reference Frame ===");
3927 println!("Reference frame: {:02x?}", ref_frame);
3928
3929 match zstd::decode_all(&ref_frame[..]) {
3930 Ok(decoded) => {
3931 println!(
3932 "Reference frame with reference FSE: SUCCESS ({} bytes)",
3933 decoded.len()
3934 );
3935 println!(" Decoded: {:?}", String::from_utf8_lossy(&decoded));
3936 }
3937 Err(e) => {
3938 println!("Reference frame with reference FSE: FAILED {:?}", e);
3939 }
3940 }
3941
3942 let mut our_fse_frame = ref_frame.clone();
3944 our_fse_frame[16] = 0xf7; println!("\nOur FSE frame: {:02x?}", our_fse_frame);
3947
3948 match zstd::decode_all(&our_fse_frame[..]) {
3949 Ok(decoded) => {
3950 println!(
3951 "Reference frame with OUR FSE: SUCCESS ({} bytes)",
3952 decoded.len()
3953 );
3954 println!(" Decoded: {:?}", String::from_utf8_lossy(&decoded));
3955 }
3956 Err(e) => {
3957 println!("Reference frame with OUR FSE: FAILED {:?}", e);
3958 println!("This confirms FSE encoding difference is the issue");
3959 }
3960 }
3961 }
3962}
3963
3964#[cfg(test)]
3966mod profiling_tests {
3967 use crate::compress::block::matches_to_sequences;
3968 use crate::compress::{
3969 analyze_for_rle, CompressContext, EncodedSequence, LazyMatchFinder, MatchFinder,
3970 };
3971 use crate::huffman::HuffmanEncoder;
3972 use crate::{ZstdCompressor, ZstdDecompressor};
3973 use haagenti_core::{CompressionLevel, Compressor, Decompressor};
3974
    /// Stage-by-stage metrics for one compression run, filled in by
    /// `profile_compression` and rendered by `print_profile`.
    #[derive(Debug, Default)]
    struct CompressionProfile {
        // Raw input length in bytes.
        input_size: usize,
        // Our compressed output length (0 if our compressor returned Err).
        output_size: usize,
        // Number of matches produced by the match finder.
        num_matches: usize,
        // Sum of all match lengths (bytes covered by matches).
        total_match_bytes: usize,
        // Bytes left over as literals after match extraction.
        literal_bytes: usize,
        // Mean match length over all matches (0.0 if no matches).
        avg_match_length: f64,
        // Mean match offset over all matches (0.0 if no matches).
        avg_offset: f64,
        // Number of (literal, match) sequences built from the matches.
        num_sequences: usize,
        // True when all three code streams are uniform (RLE mode possible).
        rle_suitable: bool,
        // Distinct literal-length codes across all encoded sequences.
        ll_codes_unique: usize,
        // Distinct offset codes across all encoded sequences.
        of_codes_unique: usize,
        // Distinct match-length codes across all encoded sequences.
        ml_codes_unique: usize,
        // True when a Huffman table could be built for the literal stream.
        huffman_viable: bool,
        // Estimated Huffman-compressed size of the literal stream, in bytes.
        huffman_estimated_size: usize,
        // Size of the same input compressed by the reference zstd at level 3.
        zstd_size: usize,
    }
3998
3999 fn profile_compression(data: &[u8], level: CompressionLevel) -> CompressionProfile {
4000 let mut profile = CompressionProfile {
4001 input_size: data.len(),
4002 ..Default::default()
4003 };
4004
4005 let matches = match level {
4007 CompressionLevel::Fast | CompressionLevel::None => {
4008 let mut mf = MatchFinder::new(4);
4009 mf.find_matches(data)
4010 }
4011 _ => {
4012 let mut mf = LazyMatchFinder::new(16);
4013 mf.find_matches(data)
4014 }
4015 };
4016
4017 profile.num_matches = matches.len();
4018 if !matches.is_empty() {
4019 let total_len: usize = matches.iter().map(|m| m.length).sum();
4020 let total_off: usize = matches.iter().map(|m| m.offset).sum();
4021 profile.total_match_bytes = total_len;
4022 profile.avg_match_length = total_len as f64 / matches.len() as f64;
4023 profile.avg_offset = total_off as f64 / matches.len() as f64;
4024 }
4025
4026 let (literals, sequences) = matches_to_sequences(data, &matches);
4028 profile.literal_bytes = literals.len();
4029 profile.num_sequences = sequences.len();
4030
4031 let suitability = analyze_for_rle(&sequences);
4032 profile.rle_suitable = suitability.all_uniform();
4033
4034 if !sequences.is_empty() {
4036 use std::collections::HashSet;
4037
4038 let encoded: Vec<_> = sequences
4039 .iter()
4040 .map(|s| EncodedSequence::from_sequence(s))
4041 .collect();
4042
4043 let ll_codes: HashSet<_> = encoded.iter().map(|e| e.ll_code).collect();
4044 let of_codes: HashSet<_> = encoded.iter().map(|e| e.of_code).collect();
4045 let ml_codes: HashSet<_> = encoded.iter().map(|e| e.ml_code).collect();
4046
4047 profile.ll_codes_unique = ll_codes.len();
4048 profile.of_codes_unique = of_codes.len();
4049 profile.ml_codes_unique = ml_codes.len();
4050 }
4051
4052 if literals.len() >= 64 {
4054 if let Some(encoder) = HuffmanEncoder::build(&literals) {
4055 profile.huffman_viable = true;
4056 profile.huffman_estimated_size = encoder.estimate_size(&literals);
4057 }
4058 }
4059
4060 let mut ctx = CompressContext::new(level);
4062 if let Ok(compressed) = ctx.compress(data) {
4063 profile.output_size = compressed.len();
4064 }
4065
4066 if let Ok(zstd_compressed) = zstd::encode_all(data, 3) {
4068 profile.zstd_size = zstd_compressed.len();
4069 }
4070
4071 profile
4072 }
4073
    /// Pretty-print a `CompressionProfile` under a `name` heading, grouped by
    /// pipeline stage. The `.max(1)` guards avoid division by zero when a
    /// compression step failed and left a size at 0.
    fn print_profile(name: &str, p: &CompressionProfile) {
        println!("\n=== {} ===", name);
        println!("Input: {} bytes", p.input_size);
        println!();
        println!("MATCH FINDING:");
        println!(" Matches found: {}", p.num_matches);
        println!(
            " Match coverage: {} bytes ({:.1}%)",
            p.total_match_bytes,
            100.0 * p.total_match_bytes as f64 / p.input_size as f64
        );
        println!(
            " Literal bytes: {} ({:.1}%)",
            p.literal_bytes,
            100.0 * p.literal_bytes as f64 / p.input_size as f64
        );
        println!(" Avg match length: {:.1}", p.avg_match_length);
        println!(" Avg offset: {:.1}", p.avg_offset);
        println!();
        println!("SEQUENCES:");
        println!(" Sequences: {}", p.num_sequences);
        println!(" RLE suitable: {}", p.rle_suitable);
        println!(" Unique LL codes: {}", p.ll_codes_unique);
        println!(" Unique OF codes: {}", p.of_codes_unique);
        println!(" Unique ML codes: {}", p.ml_codes_unique);
        println!();
        println!("LITERALS:");
        println!(" Huffman viable: {}", p.huffman_viable);
        if p.huffman_viable {
            println!(
                " Huffman estimated: {} bytes ({:.1}% of literals)",
                p.huffman_estimated_size,
                100.0 * p.huffman_estimated_size as f64 / p.literal_bytes.max(1) as f64
            );
        }
        println!();
        println!("OUTPUT:");
        println!(
            " Haagenti: {} bytes ({:.2}x ratio)",
            p.output_size,
            p.input_size as f64 / p.output_size.max(1) as f64
        );
        println!(
            " Zstd ref: {} bytes ({:.2}x ratio)",
            p.zstd_size,
            p.input_size as f64 / p.zstd_size.max(1) as f64
        );
        // Signed gap: positive means our output is larger than the reference's.
        println!(
            " Gap: {} bytes ({:.1}% larger)",
            p.output_size as i64 - p.zstd_size as i64,
            100.0 * (p.output_size as f64 / p.zstd_size.max(1) as f64 - 1.0)
        );
    }
4127
4128 fn generate_text(size: usize) -> Vec<u8> {
4129 let pattern = b"The quick brown fox jumps over the lazy dog. ";
4130 let mut data = Vec::with_capacity(size);
4131 while data.len() < size {
4132 data.extend_from_slice(pattern);
4133 }
4134 data.truncate(size);
4135 data
4136 }
4137
4138 fn generate_random_text(size: usize, seed: u64) -> Vec<u8> {
4139 use rand::rngs::StdRng;
4140 use rand::{Rng, SeedableRng};
4141
4142 let words = [
4143 "the ",
4144 "quick ",
4145 "brown ",
4146 "fox ",
4147 "jumps ",
4148 "over ",
4149 "lazy ",
4150 "dog ",
4151 "compression ",
4152 "algorithm ",
4153 "data ",
4154 "stream ",
4155 "entropy ",
4156 ];
4157 let mut rng = StdRng::seed_from_u64(seed);
4158 let mut data = Vec::with_capacity(size);
4159 while data.len() < size {
4160 let word = words[rng.gen_range(0..words.len())];
4161 data.extend_from_slice(word.as_bytes());
4162 }
4163 data.truncate(size);
4164 data
4165 }
4166
4167 fn generate_binary(size: usize, seed: u64) -> Vec<u8> {
4168 use rand::rngs::StdRng;
4169 use rand::{Rng, SeedableRng};
4170
4171 let mut rng = StdRng::seed_from_u64(seed);
4172 (0..size).map(|_| rng.r#gen::<u8>()).collect()
4173 }
4174
4175 #[test]
4176 fn test_profile_text_patterns() {
4177 println!("\n========== COMPRESSION PROFILING ==========\n");
4178
4179 let data = generate_text(16384);
4181 let profile = profile_compression(&data, CompressionLevel::Default);
4182 print_profile("16KB Repeating Text", &profile);
4183
4184 let data = generate_random_text(16384, 12345);
4186 let profile = profile_compression(&data, CompressionLevel::Default);
4187 print_profile("16KB Random Text", &profile);
4188
4189 let data = generate_text(65536);
4191 let profile = profile_compression(&data, CompressionLevel::Default);
4192 print_profile("64KB Repeating Text", &profile);
4193
4194 let data = generate_binary(16384, 54321);
4196 let profile = profile_compression(&data, CompressionLevel::Default);
4197 print_profile("16KB Random Binary", &profile);
4198 }
4199
    /// Side-by-side comparison of the greedy and lazy match finders on the
    /// same repeating-text corpus, plus a match-length histogram for the lazy
    /// finder's output.
    #[test]
    fn test_profile_match_finder_quality() {
        println!("\n========== MATCH FINDER ANALYSIS ==========\n");

        let data = generate_text(16384);

        // Greedy finder with search depth 4.
        let mut greedy_mf = MatchFinder::new(4);
        let greedy_matches = greedy_mf.find_matches(&data);

        // Lazy finder with search depth 16 (may defer to a longer match).
        let mut lazy_mf = LazyMatchFinder::new(16);
        let lazy_matches = lazy_mf.find_matches(&data);

        println!("Greedy (depth=4):");
        println!(" Matches: {}", greedy_matches.len());
        if !greedy_matches.is_empty() {
            let total: usize = greedy_matches.iter().map(|m| m.length).sum();
            println!(
                " Coverage: {} bytes ({:.1}%)",
                total,
                100.0 * total as f64 / data.len() as f64
            );
            println!(
                " Avg length: {:.1}",
                total as f64 / greedy_matches.len() as f64
            );
        }

        println!("\nLazy (depth=16):");
        println!(" Matches: {}", lazy_matches.len());
        if !lazy_matches.is_empty() {
            let total: usize = lazy_matches.iter().map(|m| m.length).sum();
            println!(
                " Coverage: {} bytes ({:.1}%)",
                total,
                100.0 * total as f64 / data.len() as f64
            );
            println!(
                " Avg length: {:.1}",
                total as f64 / lazy_matches.len() as f64
            );
        }

        println!("\nMatch length distribution (Lazy):");
        // Histogram over roughly power-of-two length ranges; lengths below 3
        // (which should not occur) would land in the `_ => 9` bucket.
        let mut len_buckets = [0usize; 10];
        for m in &lazy_matches {
            let bucket = match m.length {
                3 => 0,
                4 => 1,
                5..=7 => 2,
                8..=15 => 3,
                16..=31 => 4,
                32..=63 => 5,
                64..=127 => 6,
                128..=255 => 7,
                256..=1023 => 8,
                _ => 9,
            };
            len_buckets[bucket] += 1;
        }
        println!(" 3: {}", len_buckets[0]);
        println!(" 4: {}", len_buckets[1]);
        println!(" 5-7: {}", len_buckets[2]);
        println!(" 8-15: {}", len_buckets[3]);
        println!(" 16-31: {}", len_buckets[4]);
        println!(" 32-63: {}", len_buckets[5]);
        println!(" 64-127: {}", len_buckets[6]);
        println!(" 128-255: {}", len_buckets[7]);
        println!(" 256-1023: {}", len_buckets[8]);
        println!(" 1024+: {}", len_buckets[9]);
    }
4273
    /// For four synthetic data shapes, report how many distinct LL/OF/ML codes
    /// the sequence encoder would need — all-uniform streams can take the RLE
    /// path instead of full FSE tables.
    #[test]
    fn test_profile_sequence_encoding_paths() {
        println!("\n========== SEQUENCE ENCODING PATHS ==========\n");

        let test_cases: Vec<(&str, Vec<u8>)> = vec![
            ("Uniform pattern (abcd repeat)", {
                let mut d = Vec::with_capacity(4096);
                while d.len() < 4096 {
                    d.extend_from_slice(b"abcd");
                }
                d
            }),
            ("Semi-uniform (sentence repeat)", generate_text(4096)),
            ("Random text order", generate_random_text(4096, 999)),
            ("Mixed content", {
                let mut d = generate_text(2048);
                d.extend_from_slice(&generate_random_text(2048, 888));
                d
            }),
        ];

        for (name, data) in test_cases {
            let mut mf = LazyMatchFinder::new(16);
            let matches = mf.find_matches(&data);
            let (literals, sequences) = matches_to_sequences(&data, &matches);
            let suitability = analyze_for_rle(&sequences);

            use std::collections::HashSet;
            // Count distinct codes per stream; a count of 1 on every stream is
            // what makes RLE sequence mode applicable.
            let (ll_unique, of_unique, ml_unique) = if sequences.is_empty() {
                (0, 0, 0)
            } else {
                let encoded: Vec<_> = sequences
                    .iter()
                    .map(|s| EncodedSequence::from_sequence(s))
                    .collect();
                (
                    encoded
                        .iter()
                        .map(|e| e.ll_code)
                        .collect::<HashSet<_>>()
                        .len(),
                    encoded
                        .iter()
                        .map(|e| e.of_code)
                        .collect::<HashSet<_>>()
                        .len(),
                    encoded
                        .iter()
                        .map(|e| e.ml_code)
                        .collect::<HashSet<_>>()
                        .len(),
                )
            };

            println!(
                "{}: {} seqs, RLE={}, LL={} OF={} ML={} unique codes",
                name,
                sequences.len(),
                suitability.all_uniform(),
                ll_unique,
                of_unique,
                ml_unique,
            );
        }
    }
4340
    /// Full debug dump for alternating single-byte runs (20 'X's then 20 'Y's,
    /// repeated): matches, sequence stats, RLE suitability, raw compressed
    /// bytes, and roundtrips through both our decoder and the reference one.
    #[test]
    fn test_debug_single_byte_repeats() {
        // 10 repetitions of X*20 Y*20 -> 400 bytes total.
        let mut input = Vec::new();
        for _ in 0..10 {
            input.extend(vec![b'X'; 20]);
            input.extend(vec![b'Y'; 20]);
        }
        println!("Input: {} bytes", input.len());
        println!(
            "Pattern preview: {:?}",
            String::from_utf8_lossy(&input[..60])
        );

        let mut mf = LazyMatchFinder::new(16);
        let matches = mf.find_matches(&input);
        println!("\nMatches found: {}", matches.len());
        for (i, m) in matches.iter().take(10).enumerate() {
            println!(
                " Match[{}]: pos={}, len={}, offset={}",
                i, m.position, m.length, m.offset
            );
        }

        let (literals, seqs) = matches_to_sequences(&input, &matches);
        println!("\nLiterals: {} bytes", literals.len());
        println!("Sequences: {}", seqs.len());

        // Per-stream uniformity decides whether RLE sequence mode can be used.
        let suitability = analyze_for_rle(&seqs);
        println!("RLE suitable: {}", suitability.all_uniform());
        println!(
            " LL uniform: {} (code={})",
            suitability.ll_uniform, suitability.ll_code
        );
        println!(
            " OF uniform: {} (code={})",
            suitability.of_uniform, suitability.of_code
        );
        println!(
            " ML uniform: {} (code={})",
            suitability.ml_uniform, suitability.ml_code
        );

        if !seqs.is_empty() {
            let encoded: Vec<_> = seqs
                .iter()
                .map(|s| EncodedSequence::from_sequence(s))
                .collect();
            println!("\nFirst 5 encoded sequences:");
            for (i, e) in encoded.iter().take(5).enumerate() {
                println!(" Seq[{}]: ll_code={}, of_code={}, ml_code={}, ll_extra={}, of_extra={}, ml_extra={}",
                    i, e.ll_code, e.of_code, e.ml_code, e.ll_extra, e.of_extra, e.ml_extra);
            }
        }

        let compressor = ZstdCompressor::new();
        let compressed = compressor.compress(&input).expect("Compression failed");
        println!("\nCompressed: {} bytes", compressed.len());

        // Hex dump of the whole frame for manual comparison with the spec.
        println!("Full compressed data:");
        for (i, chunk) in compressed.chunks(16).enumerate() {
            print!(" {:04x}: ", i * 16);
            for &b in chunk {
                print!("{:02x} ", b);
            }
            println!();
        }

        // Roundtrip through our own decoder...
        let decompressor = ZstdDecompressor::new();
        match decompressor.decompress(&compressed) {
            Ok(decompressed) => {
                println!("\nOur decompressor: SUCCESS, {} bytes", decompressed.len())
            }
            Err(e) => println!("\nOur decompressor: FAILED: {:?}", e),
        }

        // ...and through the reference zstd decoder as the ground truth.
        match zstd::decode_all(compressed.as_slice()) {
            Ok(decompressed) => println!("Reference zstd: SUCCESS, {} bytes", decompressed.len()),
            Err(e) => println!("Reference zstd: FAILED: {:?}", e),
        }
    }
4430}
4431
4432#[cfg(test)]
4433mod minimal_fse_debug {
4434 use crate::fse::{
4435 FseBitWriter, FseTable, InterleavedTansEncoder, LITERAL_LENGTH_ACCURACY_LOG,
4436 LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
4437 MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
4438 };
4439
    /// Build the FSE sequence bitstream for a single sequence by hand
    /// (ll_code=4, of_code=2, ml_code=41), verify its size and that our init
    /// states decode back to the intended symbols, then decode the reference
    /// encoder's 3-byte bitstream for the same sequence and compare.
    #[test]
    fn test_single_sequence_bitstream_size() {
        // Codes taken from the reference encoder's output for the same input.
        let ll_code: u8 = 4;
        let of_code: u8 = 2;
        let ml_code: u8 = 41;

        // Extra (raw) bits: OF contributes `of_code` bits; ML code 41 carries
        // 4 extra bits, value 13 here.
        let of_extra: u32 = 0;
        let ml_extra: u32 = 13; let ml_bits: u8 = 4; println!(
            "Encoded (matching reference): ll_code={}, of_code={}, ml_code={}",
            ll_code, of_code, ml_code
        );
        println!("OF extra bits: {} bits, value {}", of_code, of_extra);
        println!("ML extra bits: {} bits, value {}", ml_bits, ml_extra);

        // Predefined (default-distribution) FSE tables for all three streams.
        let ll_table = FseTable::from_predefined(
            &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
            LITERAL_LENGTH_ACCURACY_LOG,
        )
        .unwrap();
        let of_table =
            FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
        let ml_table = FseTable::from_predefined(
            &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
            MATCH_LENGTH_ACCURACY_LOG,
        )
        .unwrap();

        let mut tans = InterleavedTansEncoder::new(&ll_table, &of_table, &ml_table);
        let (ll_log, of_log, ml_log) = tans.accuracy_logs();

        println!("Accuracy logs: ll={}, of={}, ml={}", ll_log, of_log, ml_log);

        let mut bits = FseBitWriter::new();

        // With a single sequence the symbols are carried entirely by the
        // initial states; no per-symbol FSE encode step follows.
        tans.init_states(ll_code, of_code, ml_code);
        let (init_ll, init_of, init_ml) = tans.get_states();
        println!(
            "After init_states: ll_state={}, of_state={}, ml_state={}",
            init_ll, init_of, init_ml
        );

        let (ll_state, of_state, ml_state) = tans.get_states();
        println!(
            "States (from init): ll={}, of={}, ml={}",
            ll_state, of_state, ml_state
        );

        // Write order: extra bits first (OF, ML), then the three initial
        // states ML, OF, LL — the decoder reads them back in reverse.
        if of_code > 0 {
            println!("Writing OF extra: value={}, bits={}", of_extra, of_code);
            bits.write_bits(of_extra, of_code);
        }
        if ml_bits > 0 {
            println!("Writing ML extra: value={}, bits={}", ml_extra, ml_bits);
            bits.write_bits(ml_extra, ml_bits);
        }
        bits.write_bits(ml_state, ml_log);
        bits.write_bits(of_state, of_log);
        bits.write_bits(ll_state, ll_log);

        println!("No FSE encode for single sequence (captured by init_state)");

        let bitstream = bits.finish();
        println!("Bitstream ({} bytes): {:02x?}", bitstream.len(), bitstream);

        // Size accounting: 2 OF-extra + 4 ML-extra + (ll_log+of_log+ml_log)
        // state bits; with the printed accuracy logs this totals 3 bytes.
        println!("\nTotal bits written:");
        let total_extra = of_code as u32 + ml_bits as u32;
        let state_bits = ll_log + of_log + ml_log;
        println!(" OF extra: {} bits", of_code);
        println!(" ML extra: {} bits", ml_bits);
        println!(" FSE encode: 0 bits (none for single sequence)");
        println!(" Init states: {} bits", state_bits);
        println!(
            " Total: {} bits = {} bytes",
            total_extra + state_bits as u32,
            ((total_extra + state_bits as u32) + 7) / 8
        );

        assert_eq!(
            bitstream.len(),
            3,
            "Bitstream should be exactly 3 bytes for 1 sequence, got {}",
            bitstream.len()
        );

        println!("\n=== Comparing with reference ===");
        println!("Our bitstream: {:02x?}", bitstream);
        println!(
            "Our init states: LL={}, OF={}, ML={}",
            init_ll, init_of, init_ml
        );

        // Each init state must sit on a table entry whose symbol equals the
        // code we intended to encode.
        let ll_sym = ll_table.decode(init_ll as usize).symbol;
        let of_sym = of_table.decode(init_of as usize).symbol;
        let ml_sym = ml_table.decode(init_ml as usize).symbol;
        println!(
            "Symbols at our states: LL={}, OF={}, ML={}",
            ll_sym, of_sym, ml_sym
        );
        println!(
            "Expected symbols: LL={}, OF={}, ML={}",
            ll_code, of_code, ml_code
        );

        assert_eq!(
            ll_sym, ll_code,
            "LL init state {} decodes to {} instead of {}",
            init_ll, ll_sym, ll_code
        );
        assert_eq!(
            of_sym, of_code,
            "OF init state {} decodes to {} instead of {}",
            init_of, of_sym, of_code
        );
        assert_eq!(
            ml_sym, ml_code,
            "ML init state {} decodes to {} instead of {}",
            init_ml, ml_sym, ml_code
        );

        // Cross-check: decode the reference encoder's 3-byte bitstream with
        // our BitReader/FseDecoder and compare states, symbols, extra bits.
        println!("\n=== Decoding reference bitstream ===");
        let ref_bitstream = vec![0xfd, 0xe4, 0x88];
        use crate::fse::{BitReader, FseDecoder};
        let mut bits = BitReader::new(&ref_bitstream);
        bits.init_from_end().unwrap();

        let mut ll_dec = FseDecoder::new(&ll_table);
        let mut of_dec = FseDecoder::new(&of_table);
        let mut ml_dec = FseDecoder::new(&ml_table);

        // Read order mirrors the write order above: LL, OF, ML states first.
        ll_dec.init_state(&mut bits).unwrap();
        of_dec.init_state(&mut bits).unwrap();
        ml_dec.init_state(&mut bits).unwrap();

        let ref_ll_state = ll_dec.state();
        let ref_of_state = of_dec.state();
        let ref_ml_state = ml_dec.state();

        println!(
            "Reference init states: LL={}, OF={}, ML={}",
            ref_ll_state, ref_of_state, ref_ml_state
        );

        let ref_ll_sym = ll_table.decode(ref_ll_state).symbol;
        let ref_of_sym = of_table.decode(ref_of_state).symbol;
        let ref_ml_sym = ml_table.decode(ref_ml_state).symbol;
        println!(
            "Reference symbols: LL={}, OF={}, ML={}",
            ref_ll_sym, ref_of_sym, ref_ml_sym
        );

        let remaining_bits = bits.bits_remaining();
        println!("Remaining bits after init states: {}", remaining_bits);

        // Extra bits come out in reverse of write order: ML (4 bits), OF (2).
        let ll_extra = 0; let ml_extra = bits.read_bits(4).unwrap();
        let of_extra = bits.read_bits(2).unwrap();
        println!(
            "Reference extra bits: LL={}, ML={}, OF={}",
            ll_extra, ml_extra, of_extra
        );

        println!("Expected extra bits: LL=0, ML=13, OF=0");

        // ML code 41 has baseline 83, so match_length = 83 + ml_extra.
        let ref_ml = 83 + ml_extra;
        println!("Reference match_length = 83 + {} = {}", ml_extra, ref_ml);

        // NOTE(review): of_code=2 here looks like a repeat-offset encoding;
        // whether it should carry extra bits is still an open question.
        println!("OF code 2 = repeat offset 3 = initial value 8");
        println!("But OF has extra bits {}? That's confusing...", of_extra);

    }
4667
    /// Compress a tiny repetitive input with the reference encoder and walk
    /// the resulting frame by hand — magic, frame header, block header,
    /// literals section, sequences section — printing every field so it can be
    /// compared against what our encoder emits for the same input.
    #[test]
    fn test_compare_with_reference_bitstream() {
        // 25 repeats of "ABCD" (100 bytes): small enough to stay single-block.
        let data: Vec<u8> = b"ABCD".iter().cycle().take(100).copied().collect();

        let ref_compressed = zstd::encode_all(data.as_slice(), 1).unwrap();
        println!(
            "Reference compressed ({} bytes): {:02x?}",
            ref_compressed.len(),
            ref_compressed
        );

        // 4-byte little-endian magic number, then the frame header descriptor.
        let magic = u32::from_le_bytes([
            ref_compressed[0],
            ref_compressed[1],
            ref_compressed[2],
            ref_compressed[3],
        ]);
        println!("Magic: 0x{:08x}", magic);

        let fhd = ref_compressed[4];
        println!("FHD: 0x{:02x}", fhd);

        let content_size_flag = (fhd >> 6) & 0x03;
        let single_segment_flag = (fhd >> 5) & 0x01;

        // The window descriptor byte is present only without single-segment.
        let window_desc_size = if single_segment_flag == 0 { 1 } else { 0 };

        // Frame content size field width: flag 0 means a 1-byte field in
        // single-segment mode and no field otherwise.
        let content_size_bytes = match (content_size_flag, single_segment_flag) {
            (0, 1) => 1, (0, 0) => 0, (1, _) => 2,
            (2, _) => 4,
            (3, _) => 8,
            _ => 0,
        };

        let frame_header_size = 1 + window_desc_size + content_size_bytes;
        println!(
            "Frame header: FHD=1 + Window_Desc={} + Content_Size={} = {} bytes",
            window_desc_size, content_size_bytes, frame_header_size
        );

        // 3-byte LE block header: bit 0 = last-block, bits 1-2 = type,
        // bits 3+ = size.
        let block_start = 4 + frame_header_size;
        let block_header = u32::from_le_bytes([
            ref_compressed[block_start],
            ref_compressed[block_start + 1],
            ref_compressed[block_start + 2],
            0,
        ]);
        let block_type = (block_header >> 1) & 0x03;
        let block_size = (block_header >> 3) as usize;
        println!("Block header: 0x{:06x}", block_header);
        println!("Block type: {} (0=raw, 1=rle, 2=compressed)", block_type);
        println!("Block size: {} bytes", block_size);

        if block_type == 2 {
            let block_content_start = block_start + 3;
            let block_content =
                &ref_compressed[block_content_start..block_content_start + block_size];
            println!(
                "Block content ({} bytes): {:02x?}",
                block_content.len(),
                block_content
            );

            // Literals section header: low 2 bits select the literals type.
            let lit_header = block_content[0];
            let lit_type = lit_header & 0x03;
            println!("Literals header: 0x{:02x}, type={}", lit_header, lit_type);

            // Raw/RLE literals: decode the variable-width size field (1, 2,
            // or 3 header bytes). Compressed literal types are not parsed here.
            let (lit_block_size, lit_header_size) = match lit_type {
                0 | 1 => {
                    if lit_header < 128 {
                        ((lit_header >> 3) as usize, 1)
                    } else if (lit_header & 0x0C) == 0 {
                        let sz = ((lit_header as usize) >> 4) + ((block_content[1] as usize) << 4);
                        (sz, 2)
                    } else {
                        (
                            ((lit_header as usize) >> 4)
                                + ((block_content[1] as usize) << 4)
                                + ((block_content[2] as usize) << 12),
                            3,
                        )
                    }
                }
                _ => (0, 1), };
            println!(
                "Literals block: type={}, size={} bytes, header={} bytes",
                lit_type, lit_block_size, lit_header_size
            );

            // RLE literals store one payload byte regardless of decoded size.
            let seq_start = lit_header_size + if lit_type == 1 { 1 } else { lit_block_size };
            println!("Sequences start at offset: {}", seq_start);

            if seq_start < block_content.len() {
                let seq_section = &block_content[seq_start..];
                println!(
                    "Sequences section ({} bytes): {:02x?}",
                    seq_section.len(),
                    seq_section
                );

                if !seq_section.is_empty() {
                    let seq_count = seq_section[0];
                    println!("Sequence count: {}", seq_count);

                    if seq_count > 0 && seq_section.len() > 1 {
                        let mode = seq_section[1];
                        println!("Mode byte: 0x{:02x}", mode);

                        // mode 0 = predefined tables (no table descriptions).
                        // NOTE(review): the `2 + 3` offset for other modes is a
                        // debugging guess, not a general table-size computation.
                        let bitstream_start = if mode == 0 { 2 } else { 2 + 3 }; if bitstream_start < seq_section.len() {
                            let bitstream = &seq_section[bitstream_start..];
                            println!(
                                "FSE bitstream ({} bytes): {:02x?}",
                                bitstream.len(),
                                bitstream
                            );
                        }
                    }
                }
            }
        }
    }
4805}
4806
4807#[cfg(test)]
4808mod internal_roundtrip_tests {
4809 use super::*;
4810 use haagenti_core::{Compressor, Decompressor};
4811
4812 #[test]
4813 fn test_internal_roundtrip_500() {
4814 let data: Vec<u8> = b"ABCD".iter().cycle().take(500).copied().collect();
4816
4817 println!("=== Internal Roundtrip Test (500 bytes) ===");
4818 println!("Input: {} bytes", data.len());
4819
4820 let compressor = ZstdCompressor::new();
4822 let compressed = compressor.compress(&data).expect("compress failed");
4823 println!("Compressed: {} bytes", compressed.len());
4824 println!("Compressed bytes: {:02x?}", &compressed);
4825
4826 let decompressor = ZstdDecompressor::new();
4828 match decompressor.decompress(&compressed) {
4829 Ok(decompressed) => {
4830 println!("Decompressed: {} bytes", decompressed.len());
4831 if decompressed == data {
4832 println!("SUCCESS! Internal roundtrip works!");
4833 } else {
4834 println!("MISMATCH!");
4835 println!("First 20 original: {:?}", &data[..20]);
4836 println!(
4837 "First 20 decoded: {:?}",
4838 &decompressed[..20.min(decompressed.len())]
4839 );
4840 }
4841 assert_eq!(decompressed, data);
4842 }
4843 Err(e) => {
4844 println!("FAILED: Our decoder failed: {:?}", e);
4845 panic!("Internal roundtrip failed");
4846 }
4847 }
4848 }
4849
4850 #[test]
4851 fn test_debug_ml_table_symbols() {
4852 use crate::block::MATCH_LENGTH_BASELINE;
4853 use crate::fse::{FseTable, MATCH_LENGTH_ACCURACY_LOG, MATCH_LENGTH_DEFAULT_DISTRIBUTION};
4854
4855 let ml_table = FseTable::from_predefined(
4856 &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
4857 MATCH_LENGTH_ACCURACY_LOG,
4858 )
4859 .unwrap();
4860
4861 println!("=== ML Table Symbols Debug ===");
4862
4863 let mut mismatches = 0;
4866 for state in 0..64 {
4867 let entry = ml_table.decode(state);
4868 let symbol = entry.symbol as usize;
4869
4870 if symbol < MATCH_LENGTH_BASELINE.len() {
4872 let (expected_bits, expected_base) = MATCH_LENGTH_BASELINE[symbol];
4873
4874 if entry.seq_base != expected_base || entry.seq_extra_bits != expected_bits {
4875 println!("MISMATCH State {}: symbol={}", state, symbol);
4876 println!(
4877 " Table: seq_base={}, seq_extra_bits={}",
4878 entry.seq_base, entry.seq_extra_bits
4879 );
4880 println!(
4881 " MATCH_LENGTH_BASELINE[{}]: baseline={}, bits={}",
4882 symbol, expected_base, expected_bits
4883 );
4884 mismatches += 1;
4885 }
4886 }
4887 }
4888
4889 println!("\nTotal mismatches: {}", mismatches);
4890
4891 for state in [19, 41, 42, 43, 44, 45, 62, 63] {
4893 let entry = ml_table.decode(state);
4894 println!(
4895 "State {}: symbol={}, seq_base={}, seq_extra_bits={}",
4896 state, entry.symbol, entry.seq_base, entry.seq_extra_bits
4897 );
4898 if (entry.symbol as usize) < MATCH_LENGTH_BASELINE.len() {
4899 let (bits, base) = MATCH_LENGTH_BASELINE[entry.symbol as usize];
4900 println!(" Expected: baseline={}, bits={}", base, bits);
4901 }
4902 }
4903
4904 let mut all_zero = true;
4906 for state in 0..64 {
4907 if ml_table.decode(state).symbol != 0 {
4908 all_zero = false;
4909 break;
4910 }
4911 }
4912
4913 assert!(!all_zero, "ML table has all symbol=0, which is wrong!");
4914 assert_eq!(
4915 mismatches, 0,
4916 "Found {} mismatches between table and MATCH_LENGTH_BASELINE",
4917 mismatches
4918 );
4919 }
4920}
4921
4922#[cfg(test)]
4923mod ref_decode_tests {
4924 use super::*;
4925 use haagenti_core::Decompressor;
4926
    /// Decode the 4-byte FSE sequence bitstream from a reference-encoded frame
    /// twice — once by manual bit extraction, once through our
    /// BitReader/FseDecoder — and assert the decoded sequence is 4 literals
    /// plus a 496-byte match (i.e. "ABCD" x 125).
    #[test]
    fn test_trace_reference_bitstream() {
        use crate::block::{LITERAL_LENGTH_BASELINE, MATCH_LENGTH_BASELINE};
        use crate::fse::{
            BitReader, FseDecoder, FseTable, LITERAL_LENGTH_ACCURACY_LOG,
            LITERAL_LENGTH_DEFAULT_DISTRIBUTION, MATCH_LENGTH_ACCURACY_LOG,
            MATCH_LENGTH_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG, OFFSET_DEFAULT_DISTRIBUTION,
        };

        // Sequence-section bitstream bytes from the reference frame used in
        // test_decode_reference_500.
        let fse_bytes: [u8; 4] = [0xed, 0xab, 0x8e, 0x08];

        println!("=== Trace Reference Bitstream ===");
        println!("Bytes: {:02x?}", fse_bytes);

        let value = u32::from_le_bytes(fse_bytes);
        println!("As u32 LE: 0x{:08x} = {:032b}", value, value);

        // The stream is consumed from the end; the highest set bit is the
        // padding sentinel.
        let sentinel_pos = 31 - value.leading_zeros();
        println!("Sentinel at bit {}", sentinel_pos);

        // Hand-extract the three init states (6-bit LL, 5-bit OF, 6-bit ML)
        // just below an assumed sentinel at bit 27, for comparison below.
        let ll_state_bits = (value >> 21) & 0x3F; let of_state_bits = (value >> 16) & 0x1F; let ml_state_bits = (value >> 10) & 0x3F; println!("Manual extraction (assuming sentinel at 27):");
        println!(" LL bits 26-21: {:06b} = {}", ll_state_bits, ll_state_bits);
        println!(" OF bits 20-16: {:05b} = {}", of_state_bits, of_state_bits);
        println!(" ML bits 15-10: {:06b} = {}", ml_state_bits, ml_state_bits);

        // Predefined (default-distribution) decode tables.
        let ll_table = FseTable::from_predefined(
            &LITERAL_LENGTH_DEFAULT_DISTRIBUTION,
            LITERAL_LENGTH_ACCURACY_LOG,
        )
        .unwrap();
        let of_table =
            FseTable::from_predefined(&OFFSET_DEFAULT_DISTRIBUTION, OFFSET_ACCURACY_LOG).unwrap();
        let ml_table = FseTable::from_predefined(
            &MATCH_LENGTH_DEFAULT_DISTRIBUTION,
            MATCH_LENGTH_ACCURACY_LOG,
        )
        .unwrap();

        let mut ll_decoder = FseDecoder::new(&ll_table);
        let mut of_decoder = FseDecoder::new(&of_table);
        let mut ml_decoder = FseDecoder::new(&ml_table);

        let mut bits = BitReader::new(&fse_bytes);
        bits.init_from_end().expect("init_from_end");

        // Init order: LL, OF, ML.
        ll_decoder.init_state(&mut bits).expect("ll init");
        of_decoder.init_state(&mut bits).expect("of init");
        ml_decoder.init_state(&mut bits).expect("ml init");

        let ll_state = ll_decoder.state();
        let of_state = of_decoder.state();
        let ml_state = ml_decoder.state();
        println!(
            "Initial states: LL={}, OF={}, ML={}",
            ll_state, of_state, ml_state
        );

        let ll_code = ll_decoder.peek_symbol();
        let of_code = of_decoder.peek_symbol();
        let ml_code = ml_decoder.peek_symbol();
        println!(
            "Symbols: LL_code={}, OF_code={}, ML_code={}",
            ll_code, of_code, ml_code
        );

        // Extra-bit widths: LL/ML from the baseline tables; OF uses the code
        // value itself as the bit count.
        let ll_bits = if ll_code < LITERAL_LENGTH_BASELINE.len() as u8 {
            LITERAL_LENGTH_BASELINE[ll_code as usize].0
        } else {
            0
        };
        let ml_bits = if ml_code < MATCH_LENGTH_BASELINE.len() as u8 {
            MATCH_LENGTH_BASELINE[ml_code as usize].0
        } else {
            0
        };
        let of_bits = if of_code < 32 { of_code } else { 0 }; println!(
            "Extra bits needed: LL={}, ML={}, OF={}",
            ll_bits, ml_bits, of_bits
        );

        // Switch the reader to LSB mode for the extra-bit fields.
        bits.switch_to_lsb_mode().expect("switch");

        let ll_extra = if ll_bits > 0 {
            bits.read_bits(ll_bits as usize).expect("ll extra")
        } else {
            0
        };
        let ml_extra = if ml_bits > 0 {
            bits.read_bits(ml_bits as usize).expect("ml extra")
        } else {
            0
        };
        let of_extra = if of_bits > 0 {
            bits.read_bits(of_bits as usize).expect("of extra")
        } else {
            0
        };
        println!(
            "Extra bits values: LL={}, ML={}, OF={}",
            ll_extra, ml_extra, of_extra
        );

        // Reconstruct lengths as baseline + extra.
        let ll_baseline = if ll_code < LITERAL_LENGTH_BASELINE.len() as u8 {
            LITERAL_LENGTH_BASELINE[ll_code as usize].1
        } else {
            0
        };
        let ml_baseline = if ml_code < MATCH_LENGTH_BASELINE.len() as u8 {
            MATCH_LENGTH_BASELINE[ml_code as usize].1
        } else {
            0
        };

        let literal_length = ll_baseline + ll_extra;
        let match_length = ml_baseline + ml_extra;
        // Offset reconstruction: (1 << of_code) plus the extra bits.
        let offset_value = (1u32 << of_code) + of_extra;

        println!(
            "Decoded: literal_length={}, match_length={}, offset_value={}",
            literal_length, match_length, offset_value
        );

        println!(
            "Total output would be: {} literals + {} match = {}",
            literal_length,
            match_length,
            literal_length + match_length
        );

        // The frame decodes to 500 bytes: 4 literals then a 496-byte match.
        assert_eq!(literal_length, 4, "literal_length");
        assert_eq!(match_length, 496, "match_length should be 496");
    }
5096
5097 #[test]
5098 fn test_decode_reference_500() {
5099 let ref_compressed: [u8; 20] = [
5103 0x28, 0xb5, 0x2f, 0xfd, 0x00, 0x48, 0x5d, 0x00, 0x00, 0x20, 0x41, 0x42, 0x43, 0x44, 0x01, 0x00, 0xed, 0xab, 0x8e, 0x08, ];
5112
5113 println!("=== Test Decode Reference 500 ===");
5114 println!("Reference compressed: {} bytes", ref_compressed.len());
5115 println!("Bytes: {:02x?}", ref_compressed);
5116
5117 let decompressor = ZstdDecompressor::new();
5118 match decompressor.decompress(&ref_compressed) {
5119 Ok(decompressed) => {
5120 let expected = "ABCD".repeat(125);
5121 println!("Decompressed: {} bytes", decompressed.len());
5122 if decompressed == expected.as_bytes() {
5123 println!("SUCCESS! Reference decompression matches!");
5124 } else {
5125 println!("MISMATCH!");
5126 println!("First 20 expected: {:?}", &expected.as_bytes()[..20]);
5127 println!(
5128 "First 20 got: {:?}",
5129 &decompressed[..20.min(decompressed.len())]
5130 );
5131 }
5132 assert_eq!(decompressed, expected.as_bytes());
5133 }
5134 Err(e) => {
5135 println!("FAILED: {:?}", e);
5136 panic!("Failed to decompress reference");
5137 }
5138 }
5139 }
5140}
5141
#[cfg(test)]
mod throughput_tests {
    use super::*;
    use std::time::Instant;

    /// Builds `size` bytes of highly compressible ASCII text by cycling
    /// through a small set of sentence patterns, truncating the final
    /// pattern so the output is exactly `size` bytes.
    fn generate_compressible_data(size: usize) -> Vec<u8> {
        let patterns: [&[u8]; 3] = [
            b"The quick brown fox jumps over the lazy dog. ",
            b"Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
            b"Pack my box with five dozen liquor jugs. ",
        ];

        let mut out = Vec::with_capacity(size);
        for pattern in patterns.iter().cycle() {
            if out.len() >= size {
                break;
            }
            let take = pattern.len().min(size - out.len());
            out.extend_from_slice(&pattern[..take]);
        }
        out
    }

    /// Compresses `data` `iterations` times and returns throughput in MB/s.
    fn compression_mbs(compressor: &ZstdCompressor, data: &[u8], iterations: usize) -> f64 {
        let start = Instant::now();
        for _ in 0..iterations {
            let _ = compressor.compress(data).unwrap();
        }
        (iterations as f64 * data.len() as f64) / start.elapsed().as_secs_f64() / 1_000_000.0
    }

    #[test]
    fn test_64kb_compression_throughput() {
        let data = generate_compressible_data(64 * 1024);
        let throughput_mbs = compression_mbs(&ZstdCompressor::new(), &data, 100);

        // Smoke check only: wall-clock varies per machine, so we just
        // require a positive (non-NaN) rate.
        assert!(
            throughput_mbs > 0.0,
            "64KB throughput: {:.1} MB/s",
            throughput_mbs
        );

        println!("64KB compression throughput: {:.1} MB/s", throughput_mbs);
    }

    #[test]
    fn test_1mb_compression_throughput() {
        let data = generate_compressible_data(1024 * 1024);
        let throughput_mbs = compression_mbs(&ZstdCompressor::new(), &data, 20);

        assert!(
            throughput_mbs > 0.0,
            "1MB throughput: {:.1} MB/s",
            throughput_mbs
        );

        println!("1MB compression throughput: {:.1} MB/s", throughput_mbs);
    }

    #[test]
    fn test_decompression_throughput() {
        let data = generate_compressible_data(1024 * 1024);
        let compressed = ZstdCompressor::new().compress(&data).unwrap();
        let decompressor = ZstdDecompressor::new();

        let iterations = 50;
        let start = Instant::now();
        for _ in 0..iterations {
            let _ = decompressor.decompress(&compressed).unwrap();
        }

        // Throughput is measured against the *uncompressed* size.
        let throughput_mbs =
            (iterations as f64 * data.len() as f64) / start.elapsed().as_secs_f64() / 1_000_000.0;

        assert!(
            throughput_mbs > 0.0,
            "Decompression throughput: {:.1} MB/s",
            throughput_mbs
        );

        println!("Decompression throughput: {:.1} MB/s", throughput_mbs);
    }

    #[test]
    fn test_adaptive_search_depth_scaling() {
        let compressor = ZstdCompressor::new();

        // Measure per-byte compression cost across a spread of input sizes;
        // iteration counts scale down so each size does ~1MB of total work.
        let sizes = [4096usize, 16384, 65536, 262144];
        let mut times_per_byte = Vec::new();

        for &size in &sizes {
            let data = generate_compressible_data(size);
            let iterations = (1_000_000 / size).max(1);

            let start = Instant::now();
            for _ in 0..iterations {
                let _ = compressor.compress(&data).unwrap();
            }
            let ns_per_byte = start.elapsed().as_nanos() as f64 / (iterations * size) as f64;
            times_per_byte.push((size, ns_per_byte));
        }

        let small_time = times_per_byte.first().unwrap().1;
        let large_time = times_per_byte.last().unwrap().1;

        // Large inputs must not be catastrophically slower per byte than
        // small ones — or must be fast in absolute terms anyway.
        assert!(
            large_time < small_time * 5.0 || large_time < 100.0,
            "Large data too slow: {:.2} ns/byte vs {:.2} ns/byte for small",
            large_time,
            small_time
        );
    }

    #[test]
    fn test_throughput_vs_level_tradeoff() {
        let data = generate_compressible_data(256 * 1024);

        let levels = [
            CompressionLevel::Fast,
            CompressionLevel::Default,
            CompressionLevel::Best,
        ];

        // (level, throughput in MB/s, compressed size) per level.
        let mut results: Vec<(CompressionLevel, f64, usize)> = Vec::new();

        for level in levels {
            let compressor = ZstdCompressor::with_level(level);
            let iterations = 10;

            let start = Instant::now();
            let mut compressed_size = 0;
            for _ in 0..iterations {
                compressed_size = compressor.compress(&data).unwrap().len();
            }
            let elapsed = start.elapsed();

            let throughput_mbs =
                (iterations as f64 * data.len() as f64) / elapsed.as_secs_f64() / 1_000_000.0;

            results.push((level, throughput_mbs, compressed_size));
        }

        let (fast_throughput, fast_size) = (results[0].1, results[0].2);
        let (best_throughput, best_size) = (results[2].1, results[2].2);

        assert!(fast_throughput > 0.0, "Fast throughput should be positive");
        assert!(best_throughput > 0.0, "Best throughput should be positive");

        // The highest level must never produce a larger output than Fast.
        assert!(
            best_size <= fast_size,
            "Best should compress at least as well: best={} fast={}",
            best_size,
            fast_size
        );
    }

    #[test]
    fn test_compression_efficiency_binary_vs_text() {
        let text_data = generate_compressible_data(64 * 1024);

        // Pseudo-random bytes: far less structure than the text patterns.
        let binary_data: Vec<u8> = (0u64..64 * 1024)
            .map(|i| ((i.wrapping_mul(17).wrapping_add(i.wrapping_mul(i))) % 256) as u8)
            .collect();

        let compressor = ZstdCompressor::new();

        let text_ratio = {
            let c = compressor.compress(&text_data).unwrap();
            text_data.len() as f64 / c.len() as f64
        };
        let binary_ratio = {
            let c = compressor.compress(&binary_data).unwrap();
            binary_data.len() as f64 / c.len() as f64
        };

        assert!(
            text_ratio > binary_ratio,
            "Text ratio {:.2}x should be better than binary {:.2}x",
            text_ratio,
            binary_ratio
        );
    }

    #[test]
    fn test_roundtrip_preserves_data_large() {
        let data = generate_compressible_data(512 * 1024);

        let compressed = ZstdCompressor::new().compress(&data).unwrap();
        let decompressed = ZstdDecompressor::new().decompress(&compressed).unwrap();

        assert_eq!(
            data.len(),
            decompressed.len(),
            "Large data roundtrip size mismatch"
        );
        assert_eq!(data, decompressed, "Large data roundtrip content mismatch");
    }

    #[test]
    fn test_memory_efficiency_large_data() {
        let data = generate_compressible_data(1024 * 1024);
        let compressed = ZstdCompressor::new().compress(&data).unwrap();

        // 1MB of repetitive text must achieve a meaningful ratio.
        let ratio = data.len() as f64 / compressed.len() as f64;
        assert!(
            ratio > 1.5,
            "1MB text should compress at least 1.5x, got {:.2}x",
            ratio
        );

        let decompressed = ZstdDecompressor::new().decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);
    }
}