1use serde::{Deserialize, Serialize};
32use std::io::{self, Write};
33use std::time::Instant;
34
35#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
37pub enum Codec {
38 None,
40 Gzip,
42 #[cfg(feature = "lz4")]
44 Lz4,
45 #[cfg(feature = "zstd")]
47 Zstd,
48}
49
50impl Codec {
51 fn id(&self) -> u8 {
53 match self {
54 Codec::None => 0,
55 Codec::Gzip => 1,
56 #[cfg(feature = "lz4")]
57 Codec::Lz4 => 2,
58 #[cfg(feature = "zstd")]
59 Codec::Zstd => 3,
60 }
61 }
62
63 fn from_id(id: u8) -> Result<Self, CompressionError> {
65 match id {
66 0 => Ok(Codec::None),
67 1 => Ok(Codec::Gzip),
68 #[cfg(feature = "lz4")]
69 2 => Ok(Codec::Lz4),
70 #[cfg(not(feature = "lz4"))]
71 2 => Err(CompressionError::UnsupportedCodec(
72 "LZ4 feature not enabled".to_string(),
73 )),
74 #[cfg(feature = "zstd")]
75 3 => Ok(Codec::Zstd),
76 #[cfg(not(feature = "zstd"))]
77 3 => Err(CompressionError::UnsupportedCodec(
78 "Zstd feature not enabled".to_string(),
79 )),
80 _ => Err(CompressionError::InvalidHeader(format!(
81 "Unknown codec ID: {}",
82 id
83 ))),
84 }
85 }
86
87 pub fn name(&self) -> &'static str {
89 match self {
90 Codec::None => "None",
91 Codec::Gzip => "Gzip",
92 #[cfg(feature = "lz4")]
93 Codec::Lz4 => "LZ4",
94 #[cfg(feature = "zstd")]
95 Codec::Zstd => "Zstd",
96 }
97 }
98}
99
100#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct CompressionStats {
103 pub original_size: usize,
105 pub compressed_size: usize,
107 pub ratio: f64,
109 pub codec: Codec,
111 pub duration_ms: f64,
113}
114
115impl CompressionStats {
116 pub fn space_saved(&self) -> i64 {
118 self.original_size as i64 - self.compressed_size as i64
119 }
120
121 pub fn space_saved_percent(&self) -> f64 {
123 if self.original_size == 0 {
124 0.0
125 } else {
126 (self.space_saved() as f64 / self.original_size as f64) * 100.0
127 }
128 }
129
130 pub fn throughput_mbps(&self) -> f64 {
132 if self.duration_ms == 0.0 {
133 0.0
134 } else {
135 (self.original_size as f64 / 1_000_000.0) / (self.duration_ms / 1000.0)
136 }
137 }
138}
139
140#[derive(Debug, thiserror::Error)]
142pub enum CompressionError {
143 #[error("IO error: {0}")]
144 Io(#[from] io::Error),
145
146 #[error("Invalid header: {0}")]
147 InvalidHeader(String),
148
149 #[error("Unsupported codec: {0}")]
150 UnsupportedCodec(String),
151
152 #[error("Compression failed: {0}")]
153 CompressionFailed(String),
154
155 #[error("Decompression failed: {0}")]
156 DecompressionFailed(String),
157}
158
159const HEADER_SIZE: usize = 4;
161const VERSION: u8 = 1;
162
163fn create_header(codec: Codec) -> [u8; HEADER_SIZE] {
165 [codec.id(), VERSION, 0, 0]
166}
167
168fn parse_header(data: &[u8]) -> Result<(Codec, usize), CompressionError> {
170 if data.len() < HEADER_SIZE {
171 return Err(CompressionError::InvalidHeader(format!(
172 "Data too small: {} bytes",
173 data.len()
174 )));
175 }
176
177 let codec = Codec::from_id(data[0])?;
178 let version = data[1];
179
180 if version != VERSION {
181 return Err(CompressionError::InvalidHeader(format!(
182 "Unsupported version: {}",
183 version
184 )));
185 }
186
187 Ok((codec, HEADER_SIZE))
188}
189
190pub fn best_codec() -> Codec {
203 #[cfg(feature = "lz4")]
204 {
205 Codec::Lz4
206 }
207
208 #[cfg(all(feature = "zstd", not(feature = "lz4")))]
209 {
210 return Codec::Zstd;
211 }
212
213 #[cfg(not(any(feature = "lz4", feature = "zstd")))]
214 {
215 Codec::Gzip
216 }
217}
218
219pub fn compress(data: &[u8]) -> Result<(Vec<u8>, CompressionStats), CompressionError> {
234 compress_with(data, best_codec())
235}
236
237pub fn compress_with(
249 data: &[u8],
250 codec: Codec,
251) -> Result<(Vec<u8>, CompressionStats), CompressionError> {
252 let start = Instant::now();
253 let original_size = data.len();
254
255 let header = create_header(codec);
256 let mut compressed = Vec::with_capacity(HEADER_SIZE + data.len());
257 compressed.extend_from_slice(&header);
258
259 match codec {
260 Codec::None => {
261 compressed.extend_from_slice(data);
262 }
263 Codec::Gzip => {
264 compress_gzip(data, &mut compressed)?;
265 }
266 #[cfg(feature = "lz4")]
267 Codec::Lz4 => {
268 compress_lz4(data, &mut compressed)?;
269 }
270 #[cfg(feature = "zstd")]
271 Codec::Zstd => {
272 compress_zstd(data, &mut compressed)?;
273 }
274 }
275
276 let duration_ms = start.elapsed().as_secs_f64() * 1000.0;
277 let compressed_size = compressed.len();
278 let ratio = if compressed_size > 0 {
279 original_size as f64 / compressed_size as f64
280 } else {
281 0.0
282 };
283
284 let stats = CompressionStats {
285 original_size,
286 compressed_size,
287 ratio,
288 codec,
289 duration_ms,
290 };
291
292 Ok((compressed, stats))
293}
294
295pub fn decompress(data: &[u8]) -> Result<Vec<u8>, CompressionError> {
308 let (codec, offset) = parse_header(data)?;
309 let compressed_data = &data[offset..];
310
311 match codec {
312 Codec::None => Ok(compressed_data.to_vec()),
313 Codec::Gzip => decompress_gzip(compressed_data),
314 #[cfg(feature = "lz4")]
315 Codec::Lz4 => decompress_lz4(compressed_data),
316 #[cfg(feature = "zstd")]
317 Codec::Zstd => decompress_zstd(compressed_data),
318 }
319}
320
321fn compress_gzip(data: &[u8], output: &mut Vec<u8>) -> Result<(), CompressionError> {
327 use flate2::write::GzEncoder;
328 use flate2::Compression;
329
330 let mut encoder = GzEncoder::new(output, Compression::default());
331 encoder.write_all(data)?;
332 encoder.finish()?;
333 Ok(())
334}
335
336fn decompress_gzip(data: &[u8]) -> Result<Vec<u8>, CompressionError> {
338 use flate2::read::GzDecoder;
339 use std::io::Read;
340
341 let mut decoder = GzDecoder::new(data);
342 let mut result = Vec::new();
343 decoder
344 .read_to_end(&mut result)
345 .map_err(|e| CompressionError::DecompressionFailed(e.to_string()))?;
346 Ok(result)
347}
348
/// Appends an LZ4 block encoding of `data` to `output`.
///
/// Payload layout: a 4-byte little-endian length of the uncompressed input,
/// followed by the raw LZ4 block (high-compression mode, level 9, without the
/// library's own size prefix).
///
/// `output` is only modified once compression has succeeded, so a failure
/// never leaves a dangling size prefix behind.
///
/// # Errors
///
/// Returns `CompressionError::CompressionFailed` when the input length does
/// not fit the 32-bit size prefix or when the LZ4 encoder reports an error.
#[cfg(feature = "lz4")]
fn compress_lz4(data: &[u8], output: &mut Vec<u8>) -> Result<(), CompressionError> {
    use std::convert::TryFrom;

    // A plain `as u32` cast would silently truncate the recorded length for
    // oversized inputs; reject them explicitly instead.
    let original_size = u32::try_from(data.len()).map_err(|_| {
        CompressionError::CompressionFailed(format!(
            "LZ4 input too large: {} bytes does not fit the 32-bit size prefix",
            data.len()
        ))
    })?;

    let compressed = lz4::block::compress(
        data,
        Some(lz4::block::CompressionMode::HIGHCOMPRESSION(9)),
        false,
    )
    .map_err(|e| CompressionError::CompressionFailed(e.to_string()))?;

    let size_prefix = original_size.to_le_bytes();
    output.reserve(size_prefix.len() + compressed.len());
    output.extend_from_slice(&size_prefix);
    output.extend_from_slice(&compressed);
    Ok(())
}
368
/// Decodes a payload produced by `compress_lz4`.
///
/// The payload must start with a 4-byte little-endian length of the
/// uncompressed data, followed by the raw LZ4 block.
///
/// # Errors
///
/// Returns `CompressionError::DecompressionFailed` when the size prefix is
/// missing, when the declared size does not fit in an `i32` (the type the LZ4
/// binding accepts), or when the LZ4 decoder reports an error.
#[cfg(feature = "lz4")]
fn decompress_lz4(data: &[u8]) -> Result<Vec<u8>, CompressionError> {
    use std::convert::TryFrom;

    const SIZE_PREFIX_LEN: usize = 4;

    if data.len() < SIZE_PREFIX_LEN {
        return Err(CompressionError::DecompressionFailed(
            "LZ4 data too small: missing size header".to_string(),
        ));
    }

    let (prefix, compressed_data) = data.split_at(SIZE_PREFIX_LEN);
    let declared_size = u32::from_le_bytes([prefix[0], prefix[1], prefix[2], prefix[3]]);

    // The size comes from untrusted input. A plain `as i32` cast would wrap
    // values >= 2^31 into negative numbers, so convert with a range check.
    let original_size = i32::try_from(declared_size).map_err(|_| {
        CompressionError::DecompressionFailed(format!(
            "LZ4 size header out of range: {} bytes",
            declared_size
        ))
    })?;

    lz4::block::decompress(compressed_data, Some(original_size))
        .map_err(|e| CompressionError::DecompressionFailed(e.to_string()))
}
389
/// Appends the Zstandard encoding of `data` (compression level 3) to `output`.
///
/// # Errors
///
/// Encoder failures are reported as `CompressionError::CompressionFailed`
/// carrying the error's message.
#[cfg(feature = "zstd")]
fn compress_zstd(data: &[u8], output: &mut Vec<u8>) -> Result<(), CompressionError> {
    match zstd::encode_all(data, 3) {
        Ok(encoded) => {
            output.extend_from_slice(&encoded);
            Ok(())
        }
        Err(err) => Err(CompressionError::CompressionFailed(err.to_string())),
    }
}
398
/// Decodes a Zstandard stream into a freshly allocated buffer.
///
/// # Errors
///
/// Decoder failures are reported as `CompressionError::DecompressionFailed`
/// carrying the error's message.
#[cfg(feature = "zstd")]
fn decompress_zstd(data: &[u8]) -> Result<Vec<u8>, CompressionError> {
    let decoded = zstd::decode_all(data);
    decoded.map_err(|err| CompressionError::DecompressionFailed(err.to_string()))
}
404
#[cfg(test)]
mod tests {
    use super::*;

    /// Shape of the synthetic payload produced by `generate_test_data`.
    enum Compressibility {
        /// Deterministic byte pattern `(i * 7 + 13) as u8`. It is not truly
        /// random (it repeats every 256 bytes) but has no long literal runs.
        Random,
        /// A short ASCII phrase repeated back to back.
        Repeated,
        /// A small JSON document repeated back to back.
        Structured,
    }

    /// Builds exactly `size` bytes of deterministic test data of the requested shape.
    fn generate_test_data(size: usize, compressibility: Compressibility) -> Vec<u8> {
        match compressibility {
            Compressibility::Random => (0..size).map(|i| (i * 7 + 13) as u8).collect(),
            Compressibility::Repeated => b"Hello, World! "
                .iter()
                .copied()
                .cycle()
                .take(size)
                .collect(),
            Compressibility::Structured => {
                let json =
                    r#"{"id": "doc-001", "content": "This is a test document", "score": 0.95}"#;
                json.bytes().cycle().take(size).collect()
            }
        }
    }

    #[test]
    fn test_codec_id_roundtrip() {
        for &id in &[0u8, 1u8] {
            let codec = Codec::from_id(id).unwrap();
            assert_eq!(codec.id(), id);
        }

        #[cfg(feature = "lz4")]
        {
            let codec = Codec::from_id(2).unwrap();
            assert_eq!(codec.id(), 2);
        }

        #[cfg(feature = "zstd")]
        {
            let codec = Codec::from_id(3).unwrap();
            assert_eq!(codec.id(), 3);
        }
    }

    #[test]
    fn test_invalid_codec_id() {
        // Unknown identifiers are a header problem, not a missing feature.
        assert!(matches!(
            Codec::from_id(99),
            Err(CompressionError::InvalidHeader(_))
        ));
    }

    #[cfg(not(feature = "lz4"))]
    #[test]
    fn test_lz4_id_rejected_without_feature() {
        assert!(matches!(
            Codec::from_id(2),
            Err(CompressionError::UnsupportedCodec(_))
        ));
    }

    #[cfg(not(feature = "zstd"))]
    #[test]
    fn test_zstd_id_rejected_without_feature() {
        assert!(matches!(
            Codec::from_id(3),
            Err(CompressionError::UnsupportedCodec(_))
        ));
    }

    #[test]
    fn test_best_codec_available() {
        let codec = best_codec();
        assert!(!codec.name().is_empty());

        #[cfg(feature = "lz4")]
        assert_eq!(codec, Codec::Lz4);

        #[cfg(all(feature = "zstd", not(feature = "lz4")))]
        assert_eq!(codec, Codec::Zstd);

        #[cfg(not(any(feature = "lz4", feature = "zstd")))]
        assert_eq!(codec, Codec::Gzip);
    }

    #[test]
    fn test_compression_none() {
        let data = b"Hello, World!";
        let (compressed, stats) = compress_with(data, Codec::None).unwrap();

        assert_eq!(stats.codec, Codec::None);
        assert_eq!(stats.original_size, data.len());
        assert_eq!(stats.compressed_size, data.len() + HEADER_SIZE);

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(&data[..], decompressed.as_slice());
    }

    #[test]
    fn test_compression_gzip() {
        let data = generate_test_data(1000, Compressibility::Repeated);
        let (compressed, stats) = compress_with(&data, Codec::Gzip).unwrap();

        assert_eq!(stats.codec, Codec::Gzip);
        assert_eq!(stats.original_size, data.len());
        assert!(
            stats.compressed_size < data.len(),
            "Gzip should compress repeated data"
        );
        assert!(stats.ratio > 1.0);
        assert!(stats.duration_ms >= 0.0);

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);
    }

    #[cfg(feature = "lz4")]
    #[test]
    fn test_compression_lz4() {
        let data = generate_test_data(1000, Compressibility::Repeated);
        let (compressed, stats) = compress_with(&data, Codec::Lz4).unwrap();

        assert_eq!(stats.codec, Codec::Lz4);
        assert_eq!(stats.original_size, data.len());
        assert!(
            stats.compressed_size < data.len(),
            "LZ4 should compress repeated data"
        );
        assert!(stats.ratio > 1.0);

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);
    }

    #[cfg(feature = "zstd")]
    #[test]
    fn test_compression_zstd() {
        let data = generate_test_data(1000, Compressibility::Repeated);
        let (compressed, stats) = compress_with(&data, Codec::Zstd).unwrap();

        assert_eq!(stats.codec, Codec::Zstd);
        assert_eq!(stats.original_size, data.len());
        assert!(
            stats.compressed_size < data.len(),
            "Zstd should compress repeated data"
        );
        assert!(stats.ratio > 1.0);

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);
    }

    #[test]
    fn test_roundtrip_best_codec() {
        let data = generate_test_data(5000, Compressibility::Structured);
        let (compressed, stats) = compress(&data).unwrap();

        println!("Best codec: {:?}, ratio: {:.2}x", stats.codec, stats.ratio);

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);
    }

    #[test]
    fn test_header_format() {
        let data = b"test data";
        let (compressed, _) = compress_with(data, Codec::Gzip).unwrap();

        assert!(compressed.len() >= HEADER_SIZE);
        assert_eq!(compressed[0], Codec::Gzip.id());
        assert_eq!(compressed[1], VERSION);
    }

    #[test]
    fn test_invalid_header_too_small() {
        let data = [1u8, 2];
        let result = decompress(&data);
        assert!(matches!(result, Err(CompressionError::InvalidHeader(_))));
    }

    #[test]
    fn test_invalid_header_wrong_version() {
        // Valid codec id (gzip) but a version byte the parser does not accept.
        let mut data = vec![1, 99, 0, 0];
        data.extend_from_slice(b"some data");
        let result = decompress(&data);
        assert!(matches!(result, Err(CompressionError::InvalidHeader(_))));
    }

    #[test]
    fn test_corrupt_gzip_payload() {
        // A well-formed frame header followed by bytes that are not a gzip stream.
        let mut data = create_header(Codec::Gzip).to_vec();
        data.extend_from_slice(b"definitely not a gzip stream");
        assert!(matches!(
            decompress(&data),
            Err(CompressionError::DecompressionFailed(_))
        ));
    }

    #[test]
    fn test_compression_stats() {
        let data = generate_test_data(10000, Compressibility::Repeated);
        let (_, stats) = compress(&data).unwrap();

        assert_eq!(stats.original_size, 10000);
        assert!(stats.compressed_size > 0);
        assert!(stats.ratio > 0.0);
        assert!(stats.duration_ms >= 0.0);

        let saved = stats.space_saved();
        let saved_percent = stats.space_saved_percent();
        let throughput = stats.throughput_mbps();

        assert!(saved > 0, "Should save space on repeated data");
        assert!(saved_percent > 0.0 && saved_percent < 100.0);

        // throughput_mbps() reports 0.0 when the measured duration is exactly
        // zero, so only require a positive value when time was recorded.
        if stats.duration_ms > 0.0 {
            assert!(throughput > 0.0);
        } else {
            assert_eq!(throughput, 0.0);
        }
    }

    #[test]
    fn test_stats_zero_inputs() {
        let stats = CompressionStats {
            original_size: 0,
            compressed_size: HEADER_SIZE,
            ratio: 0.0,
            codec: Codec::None,
            duration_ms: 0.0,
        };

        assert_eq!(stats.space_saved(), -(HEADER_SIZE as i64));
        assert_eq!(stats.space_saved_percent(), 0.0);
        assert_eq!(stats.throughput_mbps(), 0.0);
    }

    #[test]
    fn test_empty_data() {
        let data = b"";
        let (compressed, stats) = compress(data).unwrap();
        assert_eq!(stats.original_size, 0);

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(&data[..], decompressed.as_slice());
    }

    #[test]
    fn test_large_data() {
        let data = generate_test_data(1_000_000, Compressibility::Structured);
        let (compressed, stats) = compress(&data).unwrap();

        println!(
            "Large data compression: {:.2} MB -> {:.2} MB ({:.2}x, {:.2} MB/s)",
            stats.original_size as f64 / 1_000_000.0,
            stats.compressed_size as f64 / 1_000_000.0,
            stats.ratio,
            stats.throughput_mbps()
        );

        assert!(stats.ratio > 1.0, "Should compress structured data");

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data.len(), decompressed.len());
        assert_eq!(data, decompressed);
    }

    #[test]
    fn test_random_data_incompressible() {
        let data = generate_test_data(1000, Compressibility::Random);
        let (compressed, stats) = compress(&data).unwrap();

        println!("Random data ratio: {:.2}x", stats.ratio);
        assert!(
            stats.ratio < 10.0,
            "Random data should not compress as well as structured data"
        );

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);
    }

    #[test]
    fn test_embedding_vectors() {
        let embeddings: Vec<f32> = (0..384).map(|i| (i as f32) * 0.001).collect();
        let data: Vec<u8> = embeddings.iter().flat_map(|f| f.to_le_bytes()).collect();

        let (compressed, stats) = compress(&data).unwrap();

        println!(
            "Embedding compression: {} -> {} bytes ({:.2}x)",
            stats.original_size, stats.compressed_size, stats.ratio
        );

        let decompressed = decompress(&compressed).unwrap();
        assert_eq!(data, decompressed);

        // The byte-level round trip must also reproduce the original floats exactly.
        let reconstructed: Vec<f32> = decompressed
            .chunks_exact(4)
            .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
            .collect();
        assert_eq!(embeddings, reconstructed);
    }

    #[test]
    fn test_codec_names() {
        assert_eq!(Codec::None.name(), "None");
        assert_eq!(Codec::Gzip.name(), "Gzip");

        #[cfg(feature = "lz4")]
        assert_eq!(Codec::Lz4.name(), "LZ4");

        #[cfg(feature = "zstd")]
        assert_eq!(Codec::Zstd.name(), "Zstd");
    }

    /// Prints a comparison table for every compiled-in codec; run with `--ignored`.
    #[test]
    #[ignore]
    fn bench_compression_comparison() {
        let test_cases = vec![
            ("Small Random", 1_000, Compressibility::Random),
            ("Small Repeated", 1_000, Compressibility::Repeated),
            ("Small Structured", 1_000, Compressibility::Structured),
            ("Large Random", 100_000, Compressibility::Random),
            ("Large Repeated", 100_000, Compressibility::Repeated),
            ("Large Structured", 100_000, Compressibility::Structured),
        ];

        println!(
            "\n{:<20} {:<10} {:<12} {:<12} {:<10} {:<12}",
            "Test Case", "Codec", "Original", "Compressed", "Ratio", "Speed (MB/s)"
        );
        println!("{}", "=".repeat(85));

        for (name, size, comp_type) in test_cases {
            let data = generate_test_data(size, comp_type);

            let codecs = vec![
                Codec::None,
                Codec::Gzip,
                #[cfg(feature = "lz4")]
                Codec::Lz4,
                #[cfg(feature = "zstd")]
                Codec::Zstd,
            ];

            for codec in codecs {
                let (_, stats) = compress_with(&data, codec).unwrap();
                println!(
                    "{:<20} {:<10} {:<12} {:<12} {:<10.2} {:<12.2}",
                    name,
                    codec.name(),
                    stats.original_size,
                    stats.compressed_size,
                    stats.ratio,
                    stats.throughput_mbps()
                );
            }
        }
    }

    /// Prints decompression throughput for the best codec; run with `--ignored`.
    #[test]
    #[ignore]
    fn bench_decompression_speed() {
        let data = generate_test_data(1_000_000, Compressibility::Structured);
        let (compressed, comp_stats) = compress(&data).unwrap();

        println!("\nDecompression benchmark:");
        println!(
            "Codec: {:?}, Compressed size: {} bytes",
            comp_stats.codec,
            compressed.len()
        );

        let iterations = 10;
        let start = Instant::now();

        for _ in 0..iterations {
            let _ = decompress(&compressed).unwrap();
        }

        let elapsed = start.elapsed().as_secs_f64();
        let throughput = (data.len() * iterations) as f64 / 1_000_000.0 / elapsed;

        println!("Decompression throughput: {:.2} MB/s", throughput);
    }
}