use std::io::{self, Cursor, Read, Write};

use crate::dcx::{DcxHeader, FormatHint, Mode};
use crate::entropy::arithmetic::{ArithmeticDecoder, ArithmeticEncoder};
use crate::format::transform::TransformChain;
use crate::format::{detect_format, preprocess, reverse_preprocess};
use crate::mixer::MetaMixer;
use crate::model::gru_model::GruModel;
use crate::model::{CMConfig, CMEngine};

/// Picks the zstd level for Fast mode from the input size: 19 up to 1 MiB,
/// 13 up to 10 MiB, 9 beyond that. An explicit override always wins.
fn adaptive_fast_level(data_size: usize, level_override: Option<i32>) -> i32 {
    if let Some(level) = level_override {
        return level;
    }
    match data_size {
        0..=1_048_576 => 19,
        1_048_577..=10_485_760 => 13,
        _ => 9,
    }
}

/// Below this size, dictionary training is not attempted in Fast mode.
const DICT_MIN_DATA_SIZE: usize = 8192;

/// Chunk size used for dictionary training samples and chunked compression,
/// growing with the input size.
fn dict_chunk_size(data_len: usize) -> usize {
    if data_len > 4_194_304 {
        131_072
    } else if data_len > 1_048_576 {
        65_536
    } else if data_len > 262_144 {
        32_768
    } else {
        16_384
    }
}

/// Maximum dictionary size to train, also scaled with the input size.
fn dict_max_size(data_len: usize) -> usize {
    if data_len > 4_194_304 {
        16_384
    } else if data_len > 1_048_576 {
        8_192
    } else {
        4_096
    }
}

/// Builds training samples for dictionary training. Data that splits into at
/// least five NUL-delimited column chunks uses those (empty chunks dropped);
/// otherwise the input is cut into fixed-size chunks.
fn generate_training_samples(data: &[u8], chunk_size: usize) -> Vec<&[u8]> {
    let col_chunks: Vec<&[u8]> = data.split(|&b| b == 0x00).collect();
    if col_chunks.len() >= 5 {
        return col_chunks.into_iter().filter(|c| !c.is_empty()).collect();
    }

    split_into_chunks(data, chunk_size)
}

/// Splits `data` into consecutive chunks of at most `chunk_size` bytes.
fn split_into_chunks(data: &[u8], chunk_size: usize) -> Vec<&[u8]> {
    let mut chunks = Vec::new();
    let mut offset = 0;
    while offset < data.len() {
        let end = (offset + chunk_size).min(data.len());
        chunks.push(&data[offset..end]);
        offset = end;
    }
    chunks
}
100
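/// Fast-mode dictionary path: trains a zstd dictionary on samples drawn from
/// the input, compresses the data chunk by chunk with that dictionary, and
/// frames the result as `[dict_len u32][dict][num_chunks u32]{[len u32][chunk]}*`.
/// Returns `None` unless the framed payload beats `plain_size` (the plain zstd
/// size), so callers can fall back cheaply.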
101fn try_dict_compress(data: &[u8], level: i32, plain_size: usize) -> Option<Vec<u8>> {
110 let chunk_size = dict_chunk_size(data.len());
111
112 let training_samples = generate_training_samples(data, chunk_size);
114 if training_samples.len() < 5 {
115 return None;
116 }
117
118 let max_dict = dict_max_size(data.len());
119
120 let dict = zstd::dict::from_samples(&training_samples, max_dict).ok()?;
122 if dict.is_empty() {
123 return None;
124 }
125
126 let chunks = split_into_chunks(data, chunk_size);
128
129 let mut compressor = zstd::bulk::Compressor::with_dictionary(level, &dict).ok()?;
131 let mut compressed_chunks: Vec<Vec<u8>> = Vec::with_capacity(chunks.len());
132 for chunk in &chunks {
133 let cc = compressor.compress(chunk).ok()?;
134 compressed_chunks.push(cc);
135 }
136
137 let total_compressed: usize = compressed_chunks.iter().map(|c| 4 + c.len()).sum();
142 let payload_size = 4 + dict.len() + 4 + total_compressed;
143
144 if payload_size >= plain_size {
146 return None;
147 }
148
149 let mut payload = Vec::with_capacity(payload_size);
150 payload.extend_from_slice(&(dict.len() as u32).to_le_bytes());
151 payload.extend_from_slice(&dict);
152 payload.extend_from_slice(&(compressed_chunks.len() as u32).to_le_bytes());
153 for cc in &compressed_chunks {
154 payload.extend_from_slice(&(cc.len() as u32).to_le_bytes());
155 payload.extend_from_slice(cc);
156 }
157
158 Some(payload)
159}
160
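/// Inverse of `try_dict_compress`: parses the framed dictionary payload and
/// decompresses each chunk with the embedded dictionary. `capacity` bounds the
/// total decompressed size.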
161fn decompress_with_dict(payload: &[u8], capacity: usize) -> std::io::Result<Vec<u8>> {
170 if payload.len() < 4 {
171 return Err(io::Error::new(
172 io::ErrorKind::InvalidData,
173 "dict payload too short for dict_size",
174 ));
175 }
176 let mut pos = 0;
177
178 let dict_size =
180 u32::from_le_bytes(payload[pos..pos + 4].try_into().expect("4-byte slice")) as usize;
181 pos += 4;
182 if payload.len() < pos + dict_size {
183 return Err(io::Error::new(
184 io::ErrorKind::InvalidData,
185 "dict payload truncated: dictionary bytes",
186 ));
187 }
188 let dict_bytes = &payload[pos..pos + dict_size];
189 pos += dict_size;
190
191 if payload.len() < pos + 4 {
193 return Err(io::Error::new(
194 io::ErrorKind::InvalidData,
195 "dict payload truncated: num_chunks",
196 ));
197 }
198 let num_chunks =
199 u32::from_le_bytes(payload[pos..pos + 4].try_into().expect("4-byte slice")) as usize;
200 pos += 4;
201
202 let mut decompressor = zstd::bulk::Decompressor::with_dictionary(dict_bytes)
204 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
205
206 let mut output = Vec::with_capacity(capacity);
207
208 for i in 0..num_chunks {
209 if payload.len() < pos + 4 {
210 return Err(io::Error::new(
211 io::ErrorKind::InvalidData,
212 format!("dict payload truncated: chunk {i} size"),
213 ));
214 }
215 let chunk_size =
216 u32::from_le_bytes(payload[pos..pos + 4].try_into().expect("4-byte slice")) as usize;
217 pos += 4;
218 if payload.len() < pos + chunk_size {
219 return Err(io::Error::new(
220 io::ErrorKind::InvalidData,
221 format!("dict payload truncated: chunk {i} data"),
222 ));
223 }
224 let chunk_data = &payload[pos..pos + chunk_size];
225 pos += chunk_size;
226
227 let chunk_capacity = capacity.saturating_sub(output.len());
229 let decompressed = decompressor
230 .decompress(chunk_data, chunk_capacity)
231 .map_err(|e| {
232 io::Error::new(
233 io::ErrorKind::InvalidData,
234 format!("chunk {i} decompress failed: {e}"),
235 )
236 })?;
237 output.extend_from_slice(&decompressed);
238 }
239
240 Ok(output)
241}
242
/// Brotli encoder mode selectors passed to `brotli_compress`.
const BROTLI_MODE_GENERIC: u32 = 0;
const BROTLI_MODE_TEXT: u32 = 1;

/// Compresses `data` with brotli at the given quality, in TEXT or GENERIC
/// encoder mode.
fn brotli_compress(data: &[u8], quality: u32, mode: u32) -> io::Result<Vec<u8>> {
    use brotli::enc::backward_references::BrotliEncoderMode;
    let mut output = Vec::new();
    let brotli_mode = match mode {
        1 => BrotliEncoderMode::BROTLI_MODE_TEXT,
        _ => BrotliEncoderMode::BROTLI_MODE_GENERIC,
    };
    let params = brotli::enc::BrotliEncoderParams {
        quality: quality as i32,
        mode: brotli_mode,
        ..Default::default()
    };
    brotli::BrotliCompress(&mut io::Cursor::new(data), &mut output, &params)?;
    Ok(output)
}

/// Decompresses a brotli stream produced by `brotli_compress`.
fn brotli_decompress(data: &[u8]) -> io::Result<Vec<u8>> {
    let mut output = Vec::new();
    brotli::BrotliDecompress(&mut io::Cursor::new(data), &mut output)?;
    Ok(output)
}
275
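/// Pure context-mixing compression (no GRU/LLM blending): feeds each bit,
/// MSB first, through the CM engine and arithmetic encoder. Used by Max mode
/// when no LLM is available.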
276fn cm_compress(data: &[u8], config: CMConfig) -> Vec<u8> {
279 let mut engine = CMEngine::with_config(config);
280 let mut encoder = ArithmeticEncoder::new();
281
282 for &byte in data {
283 for bpos in 0..8 {
284 let bit = (byte >> (7 - bpos)) & 1;
285 let p = engine.predict();
286 encoder.encode(bit, p);
287 engine.update(bit);
288 }
289 }
290
291 encoder.finish()
292}
293
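/// Inverse of `cm_compress`; `original_size` is the exact number of bytes to
/// reconstruct.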
294fn cm_decompress(compressed: &[u8], original_size: usize, config: CMConfig) -> Vec<u8> {
297 let mut engine = CMEngine::with_config(config);
298 let mut decoder = ArithmeticDecoder::new(compressed);
299 let mut output = Vec::with_capacity(original_size);
300
301 for _ in 0..original_size {
302 let mut byte_val: u8 = 0;
303 for bpos in 0..8 {
304 let p = engine.predict();
305 let bit = decoder.decode(p);
306 engine.update(bit);
307 byte_val |= bit << (7 - bpos);
308 }
309 output.push(byte_val);
310 }
311
312 output
313}
314
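/// Balanced-mode compression: blends the CM engine's bit prediction with a
/// small GRU byte model through a `MetaMixer`, then arithmetic-codes the
/// blended probability. Progress is reported on stderr for inputs over 100 KB.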
315fn gru_compress(data: &[u8], config: CMConfig) -> Vec<u8> {
323 let mut engine = CMEngine::with_config(config);
324 let mut gru = GruModel::new();
    let mut meta_mixer = MetaMixer::new(12);
    let mut encoder = ArithmeticEncoder::new();
327
328 let total_bytes = data.len();
329 let report_interval = if total_bytes > 100_000 {
330 total_bytes / 20
331 } else {
332 0
333 };
334
335 for (byte_idx, &byte) in data.iter().enumerate() {
336 for bpos in 0..8u8 {
337 let bit = (byte >> (7 - bpos)) & 1;
338
339 let p_cm = engine.predict();
341
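            // Partial-byte context for the GRU: a 1-prefixed accumulation of
            // the bits already coded in this byte (just 1 before the first bit).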
342 let partial = if bpos == 0 {
344 1u32
345 } else {
346 let mut p = 1u32;
347 for prev_bpos in 0..bpos {
348 let prev_bit = (byte >> (7 - prev_bpos)) & 1;
349 p = (p << 1) | prev_bit as u32;
350 }
351 p
352 };
353 let p_gru = gru.predict_bit(bpos, partial);
354
355 let p_final = meta_mixer.blend(p_cm, p_gru);
357
358 encoder.encode(bit, p_final);
359 engine.update(bit);
360 meta_mixer.update(bit);
361 }
362
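        // Train the GRU on the completed byte, then advance its hidden state.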
363 gru.train(byte);
365 gru.forward(byte);
366
367 if report_interval > 0 && (byte_idx + 1) % report_interval == 0 {
368 let pct = (byte_idx + 1) * 100 / total_bytes;
369 eprint!("\r[gru] compressing... {pct}%");
370 }
371 }
372
373 if total_bytes > 100_000 {
374 eprintln!("\r[gru] compressing... 100%");
375 }
376
377 encoder.finish()
378}
379
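/// Inverse of `gru_compress`: replays the same CM + GRU + meta-mixer state
/// transitions while decoding so predictions stay in sync with the encoder.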
380fn gru_decompress(compressed: &[u8], original_size: usize, config: CMConfig) -> Vec<u8> {
383 let mut engine = CMEngine::with_config(config);
384 let mut gru = GruModel::new();
    let mut meta_mixer = MetaMixer::new(12);
    let mut decoder = ArithmeticDecoder::new(compressed);
387 let mut output = Vec::with_capacity(original_size);
388
389 let report_interval = if original_size > 100_000 {
390 original_size / 20
391 } else {
392 0
393 };
394
395 for byte_idx in 0..original_size {
396 let mut byte_val: u8 = 0;
397
398 for bpos in 0..8u8 {
399 let p_cm = engine.predict();
401
402 let partial = if bpos == 0 {
404 1u32
405 } else {
406 let mut p = 1u32;
407 for prev_bpos in 0..bpos {
408 let prev_bit = (byte_val >> (7 - prev_bpos)) & 1;
409 p = (p << 1) | prev_bit as u32;
410 }
411 p
412 };
413 let p_gru = gru.predict_bit(bpos, partial);
414
415 let p_final = meta_mixer.blend(p_cm, p_gru);
417
418 let bit = decoder.decode(p_final);
419 engine.update(bit);
420 meta_mixer.update(bit);
421 byte_val |= bit << (7 - bpos);
422 }
423
424 output.push(byte_val);
425
426 gru.train(byte_val);
428 gru.forward(byte_val);
429
430 if report_interval > 0 && (byte_idx + 1) % report_interval == 0 {
431 let pct = (byte_idx + 1) * 100 / original_size;
432 eprint!("\r[gru] decompressing... {pct}%");
433 }
434 }
435
436 if original_size > 100_000 {
437 eprintln!("\r[gru] decompressing... 100%");
438 }
439
440 output
441}
442
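/// Max-mode compression with the LLM path: blends the CM prediction with an
/// LLM bit predictor via the neural `MetaMixer`, arithmetic-codes the result,
/// and advances the LLM with each completed byte.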
443#[cfg(feature = "neural")]
451fn neural_compress(
452 data: &[u8],
453 config: CMConfig,
454 llm: &mut datacortex_neural::LlmPredictor,
455 meta_mixer: &mut datacortex_neural::MetaMixer,
456) -> Vec<u8> {
457 let mut engine = CMEngine::with_config(config);
458 let mut encoder = ArithmeticEncoder::new();
459
460 let total_bytes = data.len();
466 let mut bytes_processed = 0;
    let report_interval = total_bytes / 20;

    for (byte_idx, &byte) in data.iter().enumerate() {
470 for bpos in 0..8u8 {
474 let bit = (byte >> (7 - bpos)) & 1;
475
476 let p_cm = engine.predict();
478
479 let partial = if bpos == 0 {
482 1u32
483 } else {
484 let mut p = 1u32;
486 for prev_bpos in 0..bpos {
487 let prev_bit = (byte >> (7 - prev_bpos)) & 1;
488 p = (p << 1) | prev_bit as u32;
489 }
490 p
491 };
492 let p_llm = llm.predict_bit(bpos, partial);
493
494 let p_final = meta_mixer.blend(p_cm, p_llm);
496
497 encoder.encode(bit, p_final);
498 engine.update(bit);
499 meta_mixer.update(bit);
500 }
501
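        // Feed the completed byte to the LLM; log only the first few prediction
        // errors to avoid flooding stderr.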
502 if let Err(e) = llm.predict_byte_probs(byte) {
504 if byte_idx < 5 {
506 eprintln!("[neural] LLM predict error at byte {byte_idx}: {e}");
507 }
508 }
509
510 bytes_processed += 1;
511 if report_interval > 0 && bytes_processed % report_interval == 0 {
512 let pct = bytes_processed * 100 / total_bytes;
513 eprint!("\r[neural] compressing... {pct}%");
514 }
515 }
516
517 if total_bytes > 1000 {
518 eprintln!("\r[neural] compressing... 100%");
519 }
520
521 encoder.finish()
522}
523
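/// Inverse of `neural_compress`; requires the same model so the CM and LLM
/// predictions replay identically during decoding.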
524#[cfg(feature = "neural")]
527fn neural_decompress(
528 compressed: &[u8],
529 original_size: usize,
530 config: CMConfig,
531 llm: &mut datacortex_neural::LlmPredictor,
532 meta_mixer: &mut datacortex_neural::MetaMixer,
533) -> Vec<u8> {
534 let mut engine = CMEngine::with_config(config);
535 let mut decoder = ArithmeticDecoder::new(compressed);
536 let mut output = Vec::with_capacity(original_size);
537
538 let report_interval = if original_size > 0 {
539 original_size / 20
540 } else {
541 1
542 };
543
544 for byte_idx in 0..original_size {
545 let mut byte_val: u8 = 0;
546
547 for bpos in 0..8u8 {
548 let p_cm = engine.predict();
550
551 let partial = if bpos == 0 {
553 1u32
554 } else {
555 let mut p = 1u32;
557 for prev_bpos in 0..bpos {
558 let prev_bit = (byte_val >> (7 - prev_bpos)) & 1;
559 p = (p << 1) | prev_bit as u32;
560 }
561 p
562 };
563 let p_llm = llm.predict_bit(bpos, partial);
564
565 let p_final = meta_mixer.blend(p_cm, p_llm);
567
568 let bit = decoder.decode(p_final);
569 engine.update(bit);
570 meta_mixer.update(bit);
571 byte_val |= bit << (7 - bpos);
572 }
573
574 output.push(byte_val);
575
576 if let Err(e) = llm.predict_byte_probs(byte_val) {
578 if byte_idx < 5 {
579 eprintln!("[neural] LLM predict error at byte {byte_idx}: {e}");
580 }
581 }
582
583 if report_interval > 0 && (byte_idx + 1) % report_interval == 0 {
584 let pct = (byte_idx + 1) * 100 / original_size;
585 eprint!("\r[neural] decompressing... {pct}%");
586 }
587 }
588
589 if original_size > 1000 {
590 eprintln!("\r[neural] decompressing... 100%");
591 }
592
593 output
594}
595
/// Maps a compression mode onto a context-mixing configuration.
fn cm_config_for_mode(mode: Mode) -> CMConfig {
    match mode {
        Mode::Max => CMConfig::max(),
        Mode::Balanced => CMConfig::balanced(),
        // Fast mode does not use the CM engine; balanced is a safe default.
        Mode::Fast => CMConfig::balanced(),
    }
}
604
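/// Resolves the GGUF model path for the neural feature: an explicit path wins,
/// then the `DATACORTEX_MODEL` environment variable, then a default under
/// `~/.datacortex/models/`. Returns `None` if nothing usable exists on disk.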
605#[cfg(feature = "neural")]
610fn resolve_model_path(explicit: Option<&str>) -> Option<String> {
611 if let Some(p) = explicit {
612 if std::path::Path::new(p).exists() {
613 return Some(p.to_string());
614 }
615 eprintln!("[neural] explicit model path not found: {p}");
616 return None;
617 }
618
619 if let Ok(p) = std::env::var("DATACORTEX_MODEL") {
620 if p.is_empty() {
621 return None;
623 }
624 if std::path::Path::new(&p).exists() {
625 return Some(p);
626 }
627 eprintln!("[neural] DATACORTEX_MODEL path not found: {p}");
        return None;
    }
630
631 if let Some(home) = std::env::var_os("HOME") {
633 let default = format!(
634 "{}/.datacortex/models/SmolLM2-135M-Instruct-Q8_0.gguf",
635 home.to_string_lossy()
636 );
637 if std::path::Path::new(&default).exists() {
638 return Some(default);
639 }
640 }
641
642 None
643}
644
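/// Compresses `data` in the given mode and writes a complete `.dcx` stream
/// (header plus payload) to `output`. Convenience wrapper over
/// `compress_with_options` with no model path or zstd level override.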
645pub fn compress<W: Write>(
647 data: &[u8],
648 mode: Mode,
649 format_override: Option<FormatHint>,
650 output: &mut W,
651) -> io::Result<()> {
652 compress_with_model(data, mode, format_override, None, output)
653}
654
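/// Like `compress`, but with an explicit model path for the neural LLM path
/// used by Max mode when the `neural` feature is enabled.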
655pub fn compress_with_model<W: Write>(
657 data: &[u8],
658 mode: Mode,
659 format_override: Option<FormatHint>,
660 model_path: Option<&str>,
661 output: &mut W,
662) -> io::Result<()> {
663 compress_with_options(data, mode, format_override, model_path, None, output)
664}
665
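/// Full-control entry point: resolves the format hint (detecting it when not
/// overridden), applies reversible preprocessing, selects a codec per mode,
/// and writes the `.dcx` header followed by the compressed payload.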
666pub fn compress_with_options<W: Write>(
668 data: &[u8],
669 mode: Mode,
670 format_override: Option<FormatHint>,
671 model_path: Option<&str>,
672 zstd_level_override: Option<i32>,
673 output: &mut W,
674) -> io::Result<()> {
675 let format_hint = format_override.unwrap_or_else(|| detect_format(data));
676 let crc = crc32fast::hash(data);
677
678 let (preprocessed, chain) = preprocess(data, format_hint, mode);
680 let transform_metadata = if chain.is_empty() {
681 vec![]
682 } else {
683 chain.serialize()
684 };
685
686 let mut use_dict = false;
688 let mut use_brotli = false;
689 let mut use_raw_fallback = false;
691 let mut use_meta_embedded = false;
693 let compressed = match mode {
694 Mode::Fast => {
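            // Fast mode races several candidate encodings and keeps whichever
            // yields the smallest total of header, metadata, and payload.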
710 let level = adaptive_fast_level(preprocessed.len(), zstd_level_override);
711
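            // Candidate A: preprocessed data with zstd, optionally dictionary-framed.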
712 let plain_a = zstd::bulk::compress(&preprocessed, level).map_err(io::Error::other)?;
714
715 let (compressed_a, dict_a) = if preprocessed.len() >= DICT_MIN_DATA_SIZE {
716 if let Some(dict_payload) = try_dict_compress(&preprocessed, level, plain_a.len()) {
717 (dict_payload, true)
718 } else {
719 (plain_a, false)
720 }
721 } else {
722 (plain_a, false)
723 };
724
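            // Estimate what the transform metadata will cost in the header (after
            // optional zstd compression) so candidate totals compare fairly.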
725 let meta_size_for_comparison = if transform_metadata.len() > 64 {
728 let compressed_meta = zstd::bulk::compress(&transform_metadata, 19)
729 .unwrap_or_else(|_| transform_metadata.clone());
730 if compressed_meta.len() < transform_metadata.len() {
731 compressed_meta.len()
732 } else {
733 transform_metadata.len()
734 }
735 } else {
736 transform_metadata.len()
737 };
738
739 let total_a = 32 + meta_size_for_comparison + compressed_a.len();
741
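            // Candidate B: the raw input with zstd, skipping preprocessing entirely.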
742 let raw_level = adaptive_fast_level(data.len(), zstd_level_override);
745 let compressed_b = zstd::bulk::compress(data, raw_level).map_err(io::Error::other)?;
746
747 let total_b = 32 + compressed_b.len();
749
750 let (
752 mut best_compressed,
753 mut best_total,
754 mut best_dict,
755 mut best_raw,
756 mut best_brotli,
757 mut best_embedded,
758 ) = if total_b < total_a {
759 (compressed_b, total_b, false, true, false, false)
760 } else {
761 (compressed_a, total_a, dict_a, false, false, false)
762 };
763
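            // Candidate C: the raw input with brotli in TEXT mode; quality 11 up
            // to 1 MiB, 9 beyond that.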
764 let brotli_quality = if data.len() <= 1_048_576 { 11 } else { 9 };
767 if let Ok(brotli_raw) = brotli_compress(data, brotli_quality, BROTLI_MODE_TEXT) {
768 let brotli_raw_total = 32 + brotli_raw.len();
769 if brotli_raw_total < best_total {
770 best_compressed = brotli_raw;
771 best_total = brotli_raw_total;
772 best_dict = false;
773 best_raw = true;
774 best_brotli = true;
775 best_embedded = false;
776 }
777 }
778
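            // Candidate D: preprocessed data with brotli, trying qualities 11 and
            // 10 for small inputs.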
779 {
782 let brotli_prep_max_q = if preprocessed.len() <= 1_048_576 {
783 11
784 } else {
785 9
786 };
787 let qualities = if brotli_prep_max_q == 11 {
788 &[11u32, 10][..]
789 } else {
790 &[brotli_prep_max_q as u32][..]
791 };
792 for &q in qualities {
793 if let Ok(brotli_prep) = brotli_compress(&preprocessed, q, BROTLI_MODE_GENERIC)
794 {
795 let brotli_prep_total = 32 + meta_size_for_comparison + brotli_prep.len();
796 if brotli_prep_total < best_total {
797 best_compressed = brotli_prep;
798 best_total = brotli_prep_total;
799 best_dict = false;
800 best_raw = false;
801 best_brotli = true;
802 best_embedded = false;
803 }
804 }
805 }
806 }
807
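            // Candidate E: embed the transform metadata in the payload itself as
            // [meta_len u32][metadata][preprocessed] so the header stays minimal.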
808 let embedded_payload = if !transform_metadata.is_empty() {
810 let mut ep = Vec::with_capacity(4 + transform_metadata.len() + preprocessed.len());
811 ep.extend_from_slice(&(transform_metadata.len() as u32).to_le_bytes());
812 ep.extend_from_slice(&transform_metadata);
813 ep.extend_from_slice(&preprocessed);
814 Some(ep)
815 } else {
816 None
817 };
818
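            // Embedded payload with brotli at the same quality ladder.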
819 if let Some(ref embedded_payload) = embedded_payload {
827 let embed_max_q = if embedded_payload.len() <= 1_048_576 {
828 11
829 } else {
830 9
831 };
832 let qualities = if embed_max_q == 11 {
833 &[11u32, 10][..]
834 } else {
835 &[embed_max_q as u32][..]
836 };
837 for &q in qualities {
838 if let Ok(brotli_embedded) =
839 brotli_compress(embedded_payload, q, BROTLI_MODE_GENERIC)
840 {
841 let brotli_embedded_total = 32 + brotli_embedded.len();
843 if brotli_embedded_total < best_total {
844 best_compressed = brotli_embedded;
845 best_total = brotli_embedded_total;
846 best_dict = false;
847 best_raw = false;
848 best_brotli = true;
849 best_embedded = true;
850 }
851 }
852 }
853 }
854
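            // Embedded payload with zstd, in case it beats brotli on this input.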
855 if let Some(ref embedded_payload) = embedded_payload {
862 let embed_level = adaptive_fast_level(embedded_payload.len(), zstd_level_override);
863 if let Ok(zstd_embedded) = zstd::bulk::compress(embedded_payload, embed_level) {
864 let zstd_embedded_total = 32 + zstd_embedded.len();
866 if zstd_embedded_total < best_total {
867 best_compressed = zstd_embedded;
868 best_total = zstd_embedded_total;
869 best_dict = false;
870 best_raw = false;
871 best_brotli = false;
872 best_embedded = true;
873 }
874 }
875 }
876
            // best_total was only needed while comparing candidates above.
            let _ = best_total;
            use_dict = best_dict;
879 use_raw_fallback = best_raw;
880 use_brotli = best_brotli;
881 use_meta_embedded = best_embedded;
882 best_compressed
883 }
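        // Balanced mode: GRU-assisted context mixing; the payload is the
        // preprocessed length (u64, little-endian) followed by the coded stream.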
884 Mode::Balanced => {
886 let config = cm_config_for_mode(mode);
887 let cm_data = gru_compress(&preprocessed, config);
888 let mut payload = Vec::with_capacity(8 + cm_data.len());
889 payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
890 payload.extend_from_slice(&cm_data);
891 payload
892 }
893 Mode::Max => {
895 let config = cm_config_for_mode(mode);
896
897 #[cfg(feature = "neural")]
898 {
899 if let Some(mpath) = resolve_model_path(model_path) {
900 match datacortex_neural::LlmPredictor::new(&mpath) {
901 Ok(mut llm) => {
902 let mut meta_mixer = datacortex_neural::MetaMixer::new(5);
903 eprintln!(
904 "[neural] Max mode: dual-path CM+LLM ({} bytes mapped)",
905 llm.mapped_bytes()
906 );
907 let cm_data =
908 neural_compress(&preprocessed, config, &mut llm, &mut meta_mixer);
909 let mut payload = Vec::with_capacity(8 + cm_data.len());
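                            // The top bit of the stored size marks a neural-coded
                            // payload so decompression knows to load the LLM.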
910 let size_with_flag = preprocessed.len() as u64 | (1u64 << 63);
913 payload.extend_from_slice(&size_with_flag.to_le_bytes());
914 payload.extend_from_slice(&cm_data);
915 payload
916 }
917 Err(e) => {
918 eprintln!("[neural] LLM init failed, falling back to CM-only: {e}");
919 let cm_data = cm_compress(&preprocessed, config);
920 let mut payload = Vec::with_capacity(8 + cm_data.len());
921 payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
922 payload.extend_from_slice(&cm_data);
923 payload
924 }
925 }
926 } else {
927 eprintln!(
928 "[neural] no model found, Max mode using CM-only. \
929 Set DATACORTEX_MODEL or use --model-path."
930 );
931 let cm_data = cm_compress(&preprocessed, config);
932 let mut payload = Vec::with_capacity(8 + cm_data.len());
933 payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
934 payload.extend_from_slice(&cm_data);
935 payload
936 }
937 }
938
939 #[cfg(not(feature = "neural"))]
940 {
                // model_path is only used when the `neural` feature is enabled.
                let _ = model_path;
                let cm_data = cm_compress(&preprocessed, config);
943 let mut payload = Vec::with_capacity(8 + cm_data.len());
944 payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
945 payload.extend_from_slice(&cm_data);
946 payload
947 }
948 }
949 };
950
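    // The raw fallback compresses the untransformed input and the embedded path
    // carries its metadata inside the payload, so neither needs header metadata.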
951 let final_metadata = if use_raw_fallback || use_meta_embedded {
955 vec![]
956 } else {
957 transform_metadata
958 };
959
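    // Metadata over 64 bytes is worth trying to zstd-compress before it goes in
    // the header; keep whichever representation is smaller.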
960 let (header_metadata, meta_compressed) = if final_metadata.len() > 64 {
964 let compressed_meta =
965 zstd::bulk::compress(&final_metadata, 19).unwrap_or_else(|_| final_metadata.clone());
966 if compressed_meta.len() < final_metadata.len() {
967 (compressed_meta, true)
968 } else {
969 (final_metadata, false)
970 }
971 } else {
972 (final_metadata, false)
973 };
974
975 let header = DcxHeader {
976 mode,
977 format_hint,
978 original_size: data.len() as u64,
979 compressed_size: compressed.len() as u64,
980 crc32: crc,
981 transform_metadata: header_metadata,
982 has_dict: use_dict,
983 meta_compressed,
984 use_brotli,
985 meta_embedded: use_meta_embedded,
986 };
987
988 header.write_to(output)?;
989 output.write_all(&compressed)?;
990
991 Ok(())
992}
993
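/// Reads a `.dcx` stream from `input` and returns the original bytes,
/// verifying both the CRC-32 and the recorded original size.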
994pub fn decompress<R: Read>(input: &mut R) -> io::Result<Vec<u8>> {
996 decompress_with_model(input, None)
997}
998
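/// Like `decompress`, but with an explicit model path for payloads that were
/// compressed with the neural LLM path.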
999pub fn decompress_with_model<R: Read>(
1001 input: &mut R,
1002 model_path: Option<&str>,
1003) -> io::Result<Vec<u8>> {
1004 let header = DcxHeader::read_from(input)?;
1005
1006 let mut compressed = vec![0u8; header.compressed_size as usize];
1007 input.read_exact(&mut compressed)?;
1008
1009 let preprocessed = match header.mode {
1011 Mode::Fast => {
1012 if header.use_brotli {
1013 brotli_decompress(&compressed)?
1014 } else {
1015 let capacity = header.original_size as usize * 2 + 65536;
1016 if header.has_dict {
1017 decompress_with_dict(&compressed, capacity)?
1018 } else {
1019 zstd::bulk::decompress(&compressed, capacity)
1020 .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?
1021 }
1022 }
1023 }
1024 Mode::Balanced => {
1025 if compressed.len() < 8 {
1027 return Err(io::Error::new(
1028 io::ErrorKind::InvalidData,
1029 "CM mode compressed data too short",
1030 ));
1031 }
1032 let size_raw = u64::from_le_bytes(compressed[..8].try_into().expect("8-byte slice"));
1033 let preprocessed_size = (size_raw & !(1u64 << 63)) as usize;
1034 let config = cm_config_for_mode(header.mode);
1035 gru_decompress(&compressed[8..], preprocessed_size, config)
1036 }
1037 Mode::Max => {
1038 if compressed.len() < 8 {
1040 return Err(io::Error::new(
1041 io::ErrorKind::InvalidData,
1042 "CM mode compressed data too short",
1043 ));
1044 }
1045 let size_raw = u64::from_le_bytes(compressed[..8].try_into().expect("8-byte slice"));
1046
1047 let neural_flag = size_raw & (1u64 << 63) != 0;
1049 let preprocessed_size = (size_raw & !(1u64 << 63)) as usize;
1050 let config = cm_config_for_mode(header.mode);
1051
1052 if neural_flag {
1053 #[cfg(feature = "neural")]
1054 {
1055 if let Some(mpath) = resolve_model_path(model_path) {
1056 match datacortex_neural::LlmPredictor::new(&mpath) {
1057 Ok(mut llm) => {
1058 let mut meta_mixer = datacortex_neural::MetaMixer::new(5);
1059 eprintln!(
1060 "[neural] decompressing with dual-path CM+LLM ({} bytes mapped)",
1061 llm.mapped_bytes()
1062 );
1063 neural_decompress(
1064 &compressed[8..],
1065 preprocessed_size,
1066 config,
1067 &mut llm,
1068 &mut meta_mixer,
1069 )
1070 }
1071 Err(e) => {
1072 return Err(io::Error::new(
1073 io::ErrorKind::Other,
1074 format!(
1075 "file was compressed with neural mode but LLM failed to load: {e}"
1076 ),
1077 ));
1078 }
1079 }
1080 } else {
1081 return Err(io::Error::new(
1082 io::ErrorKind::Other,
1083 "file was compressed with neural mode but no model found. \
1084 Set DATACORTEX_MODEL or use --model-path.",
1085 ));
1086 }
1087 }
1088
1089 #[cfg(not(feature = "neural"))]
1090 {
1091 let _ = model_path;
1092 return Err(io::Error::other(
1093 "file was compressed with neural mode but this build lacks the \
1094 `neural` feature. Rebuild with --features neural.",
1095 ));
1096 }
1097 } else {
1098 cm_decompress(&compressed[8..], preprocessed_size, config)
1099 }
1100 }
1101 };
1102
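    // With embedded metadata the decompressed stream is
    // [meta_len u32][metadata][preprocessed]; otherwise the metadata comes from
    // the header, possibly zstd-compressed.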
1103 let (preprocessed, transform_metadata) = if header.meta_embedded {
1108 if preprocessed.len() < 4 {
1109 return Err(io::Error::new(
1110 io::ErrorKind::InvalidData,
1111 "embedded metadata: decompressed stream too short for meta_len",
1112 ));
1113 }
1114 let meta_len =
1115 u32::from_le_bytes(preprocessed[0..4].try_into().expect("4-byte slice")) as usize;
1116 if preprocessed.len() < 4 + meta_len {
1117 return Err(io::Error::new(
1118 io::ErrorKind::InvalidData,
1119 format!(
1120 "embedded metadata: stream too short for metadata ({} bytes needed, {} available)",
1121 4 + meta_len,
1122 preprocessed.len()
1123 ),
1124 ));
1125 }
1126 let metadata = preprocessed[4..4 + meta_len].to_vec();
1127 let actual_preprocessed = preprocessed[4 + meta_len..].to_vec();
1128 (actual_preprocessed, metadata)
1129 } else {
1130 let tm = if header.meta_compressed && !header.transform_metadata.is_empty() {
1133 let mut decoder =
1134 zstd::Decoder::new(Cursor::new(&header.transform_metadata)).map_err(|e| {
1135 io::Error::new(
1136 io::ErrorKind::InvalidData,
1137 format!("failed to init metadata decompressor: {e}"),
1138 )
1139 })?;
1140 let mut decompressed_meta = Vec::new();
1141 decoder.read_to_end(&mut decompressed_meta).map_err(|e| {
1142 io::Error::new(
1143 io::ErrorKind::InvalidData,
1144 format!("failed to decompress transform metadata: {e}"),
1145 )
1146 })?;
1147 decompressed_meta
1148 } else {
1149 header.transform_metadata.clone()
1150 };
1151 (preprocessed, tm)
1152 };
1153
1154 let data = if transform_metadata.is_empty() {
1156 preprocessed
1157 } else {
1158 let chain = TransformChain::deserialize(&transform_metadata)?;
1159 reverse_preprocess(&preprocessed, &chain)
1160 };
1161
1162 let crc = crc32fast::hash(&data);
1164 if crc != header.crc32 {
1165 return Err(io::Error::new(
1166 io::ErrorKind::InvalidData,
1167 format!(
1168 "CRC-32 mismatch: expected {:#010X}, got {:#010X}",
1169 header.crc32, crc
1170 ),
1171 ));
1172 }
1173
1174 if data.len() as u64 != header.original_size {
1175 return Err(io::Error::new(
1176 io::ErrorKind::InvalidData,
1177 format!(
1178 "size mismatch: header says {} bytes, got {}",
1179 header.original_size,
1180 data.len()
1181 ),
1182 ));
1183 }
1184
1185 Ok(data)
1186}
1187
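/// In-memory convenience wrapper around `compress`.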
1188pub fn compress_to_vec(
1190 data: &[u8],
1191 mode: Mode,
1192 format_override: Option<FormatHint>,
1193) -> io::Result<Vec<u8>> {
1194 let mut buf = Vec::new();
1195 compress(data, mode, format_override, &mut buf)?;
1196 Ok(buf)
1197}
1198
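/// In-memory convenience wrapper around `compress_with_model`.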
1199pub fn compress_to_vec_with_model(
1201 data: &[u8],
1202 mode: Mode,
1203 format_override: Option<FormatHint>,
1204 model_path: Option<&str>,
1205) -> io::Result<Vec<u8>> {
1206 let mut buf = Vec::new();
1207 compress_with_model(data, mode, format_override, model_path, &mut buf)?;
1208 Ok(buf)
1209}
1210
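/// In-memory convenience wrapper around `compress_with_options`.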
1211pub fn compress_to_vec_with_options(
1213 data: &[u8],
1214 mode: Mode,
1215 format_override: Option<FormatHint>,
1216 model_path: Option<&str>,
1217 zstd_level_override: Option<i32>,
1218) -> io::Result<Vec<u8>> {
1219 let mut buf = Vec::new();
1220 compress_with_options(
1221 data,
1222 mode,
1223 format_override,
1224 model_path,
1225 zstd_level_override,
1226 &mut buf,
1227 )?;
1228 Ok(buf)
1229}
1230
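/// Decompresses a complete `.dcx` buffer held in memory.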
1231pub fn decompress_from_slice(dcx_data: &[u8]) -> io::Result<Vec<u8>> {
1233 let mut cursor = Cursor::new(dcx_data);
1234 decompress(&mut cursor)
1235}
1236
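/// Reads and returns just the `.dcx` header, leaving the reader positioned at
/// the start of the compressed payload.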
1237pub fn read_header<R: Read>(input: &mut R) -> io::Result<DcxHeader> {
1239 DcxHeader::read_from(input)
1240}
1241
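/// Plain zstd compression with no `.dcx` framing.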
1242pub fn raw_zstd_compress(data: &[u8], level: i32) -> io::Result<Vec<u8>> {
1244 zstd::bulk::compress(data, level).map_err(io::Error::other)
1245}
1246
1247#[cfg(test)]
1248mod tests {
1249 use super::*;
1250
1251 #[test]
1252 fn fast_mode_roundtrip() {
1253 let original = b"Hello, DataCortex! This is a test of Fast mode compression.";
1254 let compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1255 let decompressed = decompress_from_slice(&compressed).unwrap();
1256 assert_eq!(decompressed, original);
1257 }
1258
1259 #[test]
1260 fn fast_mode_json_roundtrip() {
1261 let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
1262 let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1263 let decompressed = decompress_from_slice(&compressed).unwrap();
1264 assert_eq!(decompressed, data.to_vec());
1265 }
1266
1267 #[test]
1268 fn balanced_mode_roundtrip() {
1269 let original = b"Balanced mode test data with some content.";
1270 let compressed = compress_to_vec(original, Mode::Balanced, None).unwrap();
1271 let decompressed = decompress_from_slice(&compressed).unwrap();
1272 assert_eq!(decompressed, original);
1273 }
1274
1275 #[test]
1276 fn balanced_mode_longer_text() {
1277 let original = b"The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet at least once. We need enough data to properly exercise the arithmetic coder and order-0 model.";
1278 let compressed = compress_to_vec(original, Mode::Balanced, None).unwrap();
1279 let decompressed = decompress_from_slice(&compressed).unwrap();
1280 assert_eq!(decompressed, original);
1281 }
1282
1283 #[test]
1284 fn balanced_mode_repetitive_data() {
1285 let data = "hello world! ".repeat(100);
1286 let compressed = compress_to_vec(data.as_bytes(), Mode::Balanced, None).unwrap();
1287 let decompressed = decompress_from_slice(&compressed).unwrap();
1288 assert_eq!(decompressed, data.as_bytes());
1289 }
1290
1291 #[test]
1292 fn balanced_mode_all_byte_values() {
1293 let original: Vec<u8> = (0..=255).collect();
1294 let compressed = compress_to_vec(&original, Mode::Balanced, None).unwrap();
1295 let decompressed = decompress_from_slice(&compressed).unwrap();
1296 assert_eq!(decompressed, original);
1297 }
1298
1299 #[test]
1300 fn balanced_mode_single_byte() {
1301 let original = b"X";
1302 let compressed = compress_to_vec(original, Mode::Balanced, None).unwrap();
1303 let decompressed = decompress_from_slice(&compressed).unwrap();
1304 assert_eq!(decompressed, original);
1305 }
1306
1307 #[test]
1308 fn balanced_mode_json_roundtrip() {
1309 let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
1310 let compressed = compress_to_vec(data, Mode::Balanced, Some(FormatHint::Json)).unwrap();
1311 let decompressed = decompress_from_slice(&compressed).unwrap();
1312 assert_eq!(decompressed, data.to_vec());
1313 }
1314
1315 #[test]
1316 fn empty_data_roundtrip() {
1317 let original = b"";
1318 for mode in [Mode::Fast, Mode::Balanced, Mode::Max] {
1319 let compressed = compress_to_vec(original, mode, None).unwrap();
1320 let decompressed = decompress_from_slice(&compressed).unwrap();
1321 assert_eq!(decompressed, original, "failed for mode {mode}");
1322 }
1323 }
1324
1325 #[test]
1326 fn crc_mismatch_detected() {
1327 let original = b"test data for CRC check";
1328 let mut compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
        // Corrupt a byte of the compressed payload, just past the 32-byte header.
        let header_size = 32;
        if compressed.len() > header_size + 5 {
1332 compressed[header_size + 3] ^= 0xFF;
1333 }
1334 assert!(decompress_from_slice(&compressed).is_err());
1335 }
1336
1337 #[test]
1338 fn fast_mode_actually_compresses() {
1339 let data = "hello world. ".repeat(100);
1341 let compressed = compress_to_vec(data.as_bytes(), Mode::Fast, None).unwrap();
1342 assert!(
1343 compressed.len() < data.len(),
1344 "Fast mode should compress repetitive data: {} vs {}",
1345 compressed.len(),
1346 data.len()
1347 );
1348 }
1349
1350 #[test]
1351 fn json_preprocessing_improves_fast_mode() {
1352 let data = br#"[{"name":"Alice","score":95},{"name":"Bob","score":87},{"name":"Carol","score":92},{"name":"Dave","score":88},{"name":"Eve","score":91}]"#;
1353 let with_preprocess = compress_to_vec(data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1354 let without_preprocess =
1355 compress_to_vec(data, Mode::Fast, Some(FormatHint::Generic)).unwrap();
1356
1357 assert_eq!(
1359 decompress_from_slice(&with_preprocess).unwrap(),
1360 data.to_vec()
1361 );
1362 assert_eq!(
1363 decompress_from_slice(&without_preprocess).unwrap(),
1364 data.to_vec()
1365 );
1366 }
1367
1368 #[test]
1369 fn all_modes_roundtrip() {
1370 let data = b"test all modes with some more content to ensure decent compression";
1371 for mode in [Mode::Max, Mode::Balanced, Mode::Fast] {
1372 let compressed = compress_to_vec(data, mode, None).unwrap();
1373 let decompressed = decompress_from_slice(&compressed).unwrap();
1374 assert_eq!(decompressed, data, "failed for mode {mode}");
1375 }
1376 }
1377
1378 #[test]
1379 fn cm_compress_decompress_direct() {
1380 let data = b"Hello, World! This is a direct CM test.";
1381 let compressed = cm_compress(data, CMConfig::balanced());
1382 let decompressed = cm_decompress(&compressed, data.len(), CMConfig::balanced());
1383 assert_eq!(decompressed, data.to_vec());
1384 }
1385
1386 #[test]
1387 fn cm_empty() {
1388 let data: &[u8] = b"";
1389 let compressed = cm_compress(data, CMConfig::balanced());
1390 let decompressed = cm_decompress(&compressed, 0, CMConfig::balanced());
1391 assert!(decompressed.is_empty());
1392 }
1393
1394 #[test]
1395 fn cm_single_byte() {
1396 for byte in 0..=255u8 {
1397 let data = [byte];
1398 let compressed = cm_compress(&data, CMConfig::balanced());
1399 let decompressed = cm_decompress(&compressed, 1, CMConfig::balanced());
1400 assert_eq!(
1401 decompressed, data,
1402 "CM roundtrip failed for byte {byte:#04X}"
1403 );
1404 }
1405 }
1406
1407 #[test]
1408 fn cm_repetitive_compresses() {
1409 let data = vec![b'A'; 1000];
1410 let compressed = cm_compress(&data, CMConfig::balanced());
1411 assert!(
1413 compressed.len() < 200,
1414 "CM should compress 1000 identical bytes well: {} bytes",
1415 compressed.len()
1416 );
1417 let decompressed = cm_decompress(&compressed, data.len(), CMConfig::balanced());
1418 assert_eq!(decompressed, data);
1419 }
1420
1421 #[test]
1422 fn max_mode_roundtrip() {
1423 let original = b"Max mode test data with some content for compression.";
1424 let compressed = compress_to_vec(original, Mode::Max, None).unwrap();
1425 let decompressed = decompress_from_slice(&compressed).unwrap();
1426 assert_eq!(decompressed, original);
1427 }
1428
1429 #[test]
1430 fn max_mode_longer_text() {
1431 let original = b"The quick brown fox jumps over the lazy dog. Max mode uses 2x context maps for better predictions with fewer hash collisions. This should compress slightly better than balanced mode.";
1432 let compressed = compress_to_vec(original, Mode::Max, None).unwrap();
1433 let decompressed = decompress_from_slice(&compressed).unwrap();
1434 assert_eq!(decompressed, original);
1435 }
1436
1437 #[test]
1440 fn test_dict_compress_roundtrip() {
1441 let mut ndjson = String::new();
1444 for i in 0..500 {
1445 ndjson.push_str(&format!(
1446 r#"{{"id":{},"name":"user_{}","status":"active","score":{}}}"#,
1447 i,
1448 i,
1449 i * 17 % 100
1450 ));
1451 ndjson.push('\n');
1452 }
1453 let data = ndjson.as_bytes();
1454 assert!(
1455 data.len() > DICT_MIN_DATA_SIZE,
1456 "test data should exceed dict threshold: {} bytes",
1457 data.len()
1458 );
1459
1460 let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1461 let decompressed = decompress_from_slice(&compressed).unwrap();
1462 assert_eq!(
1463 decompressed, data,
1464 "dict compress roundtrip: byte-exact mismatch"
1465 );
1466 }
1467
1468 #[test]
1469 fn test_dict_falls_back_on_small() {
1470 let data = b"small data that won't trigger dictionary training";
1472 assert!(data.len() < DICT_MIN_DATA_SIZE);
1473
1474 let compressed = compress_to_vec(data, Mode::Fast, None).unwrap();
1475 let decompressed = decompress_from_slice(&compressed).unwrap();
1476 assert_eq!(decompressed, data.to_vec());
1477
1478 let mut cursor = Cursor::new(&compressed);
1480 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1481 assert!(!header.has_dict, "small data should not have dict flag set");
1482 }
1483
1484 #[test]
1485 fn test_dict_backward_compat() {
1486 let original = b"backward compatibility test data for decompression";
1489 let compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1490
1491 let mut cursor = Cursor::new(&compressed);
1493 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1494 assert!(!header.has_dict);
1495
1496 let decompressed = decompress_from_slice(&compressed).unwrap();
1498 assert_eq!(decompressed, original.to_vec());
1499 }
1500
1501 #[test]
1502 fn test_dict_ndjson_large_roundtrip() {
1503 let mut ndjson = String::new();
1505 for i in 0..2000 {
1506 ndjson.push_str(&format!(
1507 r#"{{"timestamp":"2025-01-{:02}T{:02}:{:02}:00Z","level":"info","message":"Request processed","request_id":"req_{}","duration_ms":{}}}"#,
1508 (i % 28) + 1,
1509 i % 24,
1510 i % 60,
1511 i,
1512 (i * 13) % 500
1513 ));
1514 ndjson.push('\n');
1515 }
1516 let data = ndjson.as_bytes();
1517
1518 let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1519 let decompressed = decompress_from_slice(&compressed).unwrap();
1520 assert_eq!(decompressed, data, "large NDJSON roundtrip mismatch");
1521 }
1522
1523 #[test]
1524 fn test_dict_generic_data_roundtrip() {
1525 let mut data = Vec::new();
1528 for i in 0..3000 {
1529 data.extend_from_slice(
1530 format!("line {i}: the quick brown fox jumps over the lazy dog\n").as_bytes(),
1531 );
1532 }
1533 assert!(data.len() > DICT_MIN_DATA_SIZE);
1534
1535 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Generic)).unwrap();
1536 let decompressed = decompress_from_slice(&compressed).unwrap();
1537 assert_eq!(decompressed, data, "generic data dict roundtrip mismatch");
1538 }
1539
1540 #[test]
1541 fn test_dict_does_not_affect_other_modes() {
1542 let mut ndjson = String::new();
1545 for i in 0..200 {
1546 ndjson.push_str(&format!(
1547 r#"{{"id":{},"name":"user_{}","status":"active"}}"#,
1548 i, i
1549 ));
1550 ndjson.push('\n');
1551 }
1552 let data = ndjson.as_bytes();
1553
1554 for mode in [Mode::Balanced, Mode::Max] {
1555 let compressed = compress_to_vec(data, mode, Some(FormatHint::Ndjson)).unwrap();
1556 let mut cursor = Cursor::new(&compressed);
1557 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1558 assert!(!header.has_dict, "mode {mode} should never have dict flag");
1559 let decompressed = decompress_from_slice(&compressed).unwrap();
1560 assert_eq!(decompressed, data, "roundtrip failed for mode {mode}");
1561 }
1562 }
1563
1564 #[test]
1567 fn test_compress_with_level() {
1568 let data = "hello world, compressing with custom zstd level. ".repeat(50);
1570 let compressed =
1571 compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, Some(19))
1572 .unwrap();
1573 let decompressed = decompress_from_slice(&compressed).unwrap();
1574 assert_eq!(decompressed, data.as_bytes(), "level 19 roundtrip failed");
1575 }
1576
1577 #[test]
1578 fn test_compress_with_level_default() {
1579 let data = "default level test data. ".repeat(50);
1581 let compressed =
1582 compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, None).unwrap();
1583 let decompressed = decompress_from_slice(&compressed).unwrap();
1584 assert_eq!(
1585 decompressed,
1586 data.as_bytes(),
1587 "default level roundtrip failed"
1588 );
1589 }
1590
1591 #[test]
1592 fn test_compress_with_level_higher_ratio() {
1593 let data = r#"{"name":"Alice","score":95}"#.repeat(200);
1595 let low =
1596 compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, Some(1)).unwrap();
1597 let high = compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, Some(19))
1598 .unwrap();
1599
1600 assert_eq!(decompress_from_slice(&low).unwrap(), data.as_bytes());
1602 assert_eq!(decompress_from_slice(&high).unwrap(), data.as_bytes());
1603
1604 assert!(
1606 high.len() <= low.len(),
1607 "level 19 ({}) should be <= level 1 ({})",
1608 high.len(),
1609 low.len()
1610 );
1611 }
1612
1613 #[test]
1616 fn test_auto_fallback_picks_smaller() {
1617 let data = std::fs::read(concat!(
1621 env!("CARGO_MANIFEST_DIR"),
1622 "/../../corpus/json-bench/citm_catalog.json"
1623 ))
1624 .unwrap();
1625
1626 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1627 let decompressed = decompress_from_slice(&compressed).unwrap();
1628 assert_eq!(decompressed, data, "citm_catalog roundtrip failed");
1629
1630 let ratio = data.len() as f64 / compressed.len() as f64;
1632 assert!(
1633 ratio > 50.0,
1634 "citm_catalog should achieve >50x, got {ratio:.1}x"
1635 );
1636 }
1637
1638 #[test]
1639 fn test_auto_fallback_preprocessed_wins_on_ndjson() {
1640 let data = std::fs::read(concat!(
1643 env!("CARGO_MANIFEST_DIR"),
1644 "/../../corpus/test-ndjson.ndjson"
1645 ))
1646 .unwrap();
1647
1648 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1649 let decompressed = decompress_from_slice(&compressed).unwrap();
1650 assert_eq!(decompressed, data, "test-ndjson roundtrip failed");
1651
1652 let mut cursor = Cursor::new(&compressed);
1655 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1656 assert!(
1657 !header.transform_metadata.is_empty() || header.meta_embedded,
1658 "test-ndjson should prefer preprocessed path (non-empty transform metadata or embedded)"
1659 );
1660 }
1661
1662 #[test]
1663 fn test_auto_fallback_roundtrip() {
1664 let citm = std::fs::read(concat!(
1667 env!("CARGO_MANIFEST_DIR"),
1668 "/../../corpus/json-bench/citm_catalog.json"
1669 ))
1670 .unwrap();
1671 let ndjson = std::fs::read(concat!(
1672 env!("CARGO_MANIFEST_DIR"),
1673 "/../../corpus/test-ndjson.ndjson"
1674 ))
1675 .unwrap();
1676
1677 let compressed_citm = compress_to_vec(&citm, Mode::Fast, Some(FormatHint::Json)).unwrap();
1679 let decompressed_citm = decompress_from_slice(&compressed_citm).unwrap();
1680 assert_eq!(
1681 decompressed_citm, citm,
1682 "citm_catalog roundtrip (raw path) failed"
1683 );
1684
1685 let compressed_ndjson =
1687 compress_to_vec(&ndjson, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1688 let decompressed_ndjson = decompress_from_slice(&compressed_ndjson).unwrap();
1689 assert_eq!(
1690 decompressed_ndjson, ndjson,
1691 "test-ndjson roundtrip (preprocessed path) failed"
1692 );
1693 }
1694
1695 #[test]
1698 fn test_adaptive_level_small_data() {
1699 assert_eq!(adaptive_fast_level(100_000, None), 19);
1701 assert_eq!(adaptive_fast_level(500_000, None), 19);
1702 assert_eq!(adaptive_fast_level(1_048_576, None), 19);
1703 assert_eq!(adaptive_fast_level(0, None), 19);
1704 }
1705
1706 #[test]
1707 fn test_adaptive_level_large_data() {
1708 assert_eq!(adaptive_fast_level(1_048_577, None), 13);
1710 assert_eq!(adaptive_fast_level(5_000_000, None), 13);
1711 assert_eq!(adaptive_fast_level(10_485_760, None), 13);
1712 assert_eq!(adaptive_fast_level(10_485_761, None), 9);
1713 assert_eq!(adaptive_fast_level(100_000_000, None), 9);
1714 }
1715
1716 #[test]
1717 fn test_adaptive_level_override() {
1718 assert_eq!(adaptive_fast_level(100, Some(3)), 3);
1720 assert_eq!(adaptive_fast_level(100_000_000, Some(22)), 22);
1721 assert_eq!(adaptive_fast_level(0, Some(1)), 1);
1722 }
1723
1724 #[test]
1727 fn test_compressed_metadata_roundtrip() {
1728 let mut ndjson = String::new();
1730 for i in 0..500 {
1731 ndjson.push_str(&format!(
1732 r#"{{"id":{},"name":"user_{}","status":"active","score":{}}}"#,
1733 i,
1734 i,
1735 i * 17 % 100
1736 ));
1737 ndjson.push('\n');
1738 }
1739 let data = ndjson.as_bytes();
1740
1741 let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1742 let decompressed = decompress_from_slice(&compressed).unwrap();
1743 assert_eq!(
1744 decompressed, data,
1745 "compressed metadata roundtrip: byte-exact mismatch"
1746 );
1747
1748 let mut cursor = Cursor::new(&compressed);
1750 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
        if !header.transform_metadata.is_empty() && header.transform_metadata.len() > 10 {
            // Larger transform metadata may be stored zstd-compressed
            // (header.meta_compressed); the byte-exact roundtrip above covers
            // both representations.
        }
1757 }
1758
1759 #[test]
1760 fn test_compressed_metadata_backward_compat() {
1761 let original = b"backward compatibility test data for metadata decompression";
1764 let compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1765
1766 let decompressed = decompress_from_slice(&compressed).unwrap();
1768 assert_eq!(decompressed, original.to_vec());
1769
1770 let mut cursor = Cursor::new(&compressed);
1772 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1773 assert!(!header.meta_compressed || !header.transform_metadata.is_empty());
1775 }
1776
1777 #[test]
1778 fn test_compressed_metadata_small_skipped() {
1779 let data = br#"{"name":"Alice","age":30}"#;
1782 let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1783 let decompressed = decompress_from_slice(&compressed).unwrap();
1784 assert_eq!(decompressed, data.to_vec());
1785
1786 let mut cursor = Cursor::new(&compressed);
1787 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1788 if header.transform_metadata.len() <= 64 {
1790 assert!(
1791 !header.meta_compressed,
1792 "metadata <= 64 bytes should not be compressed, but meta_compressed=true \
1793 for {} bytes of metadata",
1794 header.transform_metadata.len()
1795 );
1796 }
1797 }
1798
1799 #[test]
1800 fn test_twitter_json_brotli_wins() {
1801 let data = std::fs::read(concat!(
1804 env!("CARGO_MANIFEST_DIR"),
1805 "/../../corpus/json-bench/twitter.json"
1806 ))
1807 .unwrap();
1808
1809 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1810 let decompressed = decompress_from_slice(&compressed).unwrap();
1811 assert_eq!(decompressed, data, "twitter.json roundtrip failed");
1812
1813 let mut cursor = Cursor::new(&compressed);
1815 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1816 assert!(
1817 header.use_brotli,
1818 "twitter.json should use brotli (FLAG_BROTLI set in header)"
1819 );
1820 }
1821
1822 #[test]
1823 fn test_compressed_metadata_all_modes_roundtrip() {
1824 let mut ndjson = String::new();
1826 for i in 0..200 {
1827 ndjson.push_str(&format!(
1828 r#"{{"id":{},"name":"user_{}","status":"active"}}"#,
1829 i, i
1830 ));
1831 ndjson.push('\n');
1832 }
1833 let data = ndjson.as_bytes();
1834
1835 for mode in [Mode::Fast, Mode::Balanced, Mode::Max] {
1836 let compressed = compress_to_vec(data, mode, Some(FormatHint::Ndjson)).unwrap();
1837 let decompressed = decompress_from_slice(&compressed).unwrap();
1838 assert_eq!(
1839 decompressed, data,
1840 "compressed metadata roundtrip failed for mode {mode}"
1841 );
1842 }
1843 }
1844
1845 #[test]
1848 fn test_brotli_compress_roundtrip() {
1849 let data = b"Hello, brotli! This is a test of the brotli compression helpers.";
1851 let compressed = brotli_compress(data, 11, BROTLI_MODE_GENERIC).unwrap();
1852 let decompressed = brotli_decompress(&compressed).unwrap();
1853 assert_eq!(decompressed, data.to_vec());
1854 }
1855
1856 #[test]
1857 fn test_brotli_auto_fallback_twitter() {
1858 let data = std::fs::read(concat!(
1860 env!("CARGO_MANIFEST_DIR"),
1861 "/../../corpus/json-bench/twitter.json"
1862 ))
1863 .unwrap();
1864
1865 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1866 let decompressed = decompress_from_slice(&compressed).unwrap();
1867 assert_eq!(decompressed, data, "twitter.json brotli roundtrip failed");
1868
1869 let mut cursor = Cursor::new(&compressed);
1870 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1871 assert!(
1872 header.use_brotli,
1873 "twitter.json should use brotli in auto-fallback"
1874 );
1875 }
1876
1877 #[test]
1878 fn test_brotli_ndjson_roundtrip() {
1879 let data = std::fs::read(concat!(
1882 env!("CARGO_MANIFEST_DIR"),
1883 "/../../corpus/test-ndjson.ndjson"
1884 ))
1885 .unwrap();
1886
1887 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1888 let decompressed = decompress_from_slice(&compressed).unwrap();
1889 assert_eq!(decompressed, data, "ndjson roundtrip failed");
1890 }
1891
1892 #[test]
1893 fn test_brotli_backward_compat() {
1894 let original = b"backward compatibility test: this data was compressed without brotli";
1898 let crc = crc32fast::hash(original);
1899 let zstd_compressed = zstd::bulk::compress(original, 19).unwrap();
1900
1901 let header = crate::dcx::DcxHeader {
1902 mode: Mode::Fast,
1903 format_hint: crate::dcx::FormatHint::Generic,
1904 original_size: original.len() as u64,
1905 compressed_size: zstd_compressed.len() as u64,
1906 crc32: crc,
1907 transform_metadata: vec![],
1908 has_dict: false,
1909 meta_compressed: false,
1910 use_brotli: false,
1911 meta_embedded: false,
1912 };
1913
1914 let mut buf = Vec::new();
1915 header.write_to(&mut buf).unwrap();
1916 buf.extend_from_slice(&zstd_compressed);
1917
1918 assert_eq!(buf[7] & crate::dcx::FLAG_BROTLI, 0);
1920
1921 let decompressed = decompress_from_slice(&buf).unwrap();
1923 assert_eq!(decompressed, original.to_vec());
1924 }
1925
1926 #[test]
1929 fn test_embedded_metadata_roundtrip() {
1930 let data = std::fs::read(concat!(
1933 env!("CARGO_MANIFEST_DIR"),
1934 "/../../corpus/test-api.json"
1935 ))
1936 .unwrap();
1937
1938 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1939 let decompressed = decompress_from_slice(&compressed).unwrap();
1940 assert_eq!(
1941 decompressed, data,
1942 "test-api.json embedded metadata roundtrip: byte-exact mismatch"
1943 );
1944 }
1945
1946 #[test]
1947 fn test_embedded_metadata_backward_compat() {
1948 let original = b"backward compat: no embedded metadata in this old file format";
1952 let crc = crc32fast::hash(original);
1953 let zstd_compressed = zstd::bulk::compress(original, 19).unwrap();
1954
1955 let header = crate::dcx::DcxHeader {
1956 mode: Mode::Fast,
1957 format_hint: crate::dcx::FormatHint::Generic,
1958 original_size: original.len() as u64,
1959 compressed_size: zstd_compressed.len() as u64,
1960 crc32: crc,
1961 transform_metadata: vec![],
1962 has_dict: false,
1963 meta_compressed: false,
1964 use_brotli: false,
1965 meta_embedded: false,
1966 };
1967
1968 let mut buf = Vec::new();
1969 header.write_to(&mut buf).unwrap();
1970 buf.extend_from_slice(&zstd_compressed);
1971
1972 assert_eq!(buf[7] & crate::dcx::FLAG_META_EMBEDDED, 0);
1974
1975 let decompressed = decompress_from_slice(&buf).unwrap();
1977 assert_eq!(decompressed, original.to_vec());
1978 }
1979
1980 #[test]
1981 fn test_embedded_metadata_small_file_improvement() {
1982 let data = std::fs::read(concat!(
1985 env!("CARGO_MANIFEST_DIR"),
1986 "/../../corpus/test-api.json"
1987 ))
1988 .unwrap();
1989
1990 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1991 let decompressed = decompress_from_slice(&compressed).unwrap();
1992 assert_eq!(decompressed, data, "roundtrip failed");
1993
1994 let ratio = data.len() as f64 / compressed.len() as f64;
1996 assert!(
1997 ratio > 5.0,
1998 "test-api.json should achieve >5x compression, got {ratio:.1}x"
1999 );
2000
2001 let mut cursor = Cursor::new(&compressed);
2003 let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
2004
2005 if header.meta_embedded {
2007 assert!(
2008 header.transform_metadata.is_empty(),
2009 "meta_embedded header should have empty transform_metadata"
2010 );
2011 assert!(header.use_brotli, "meta_embedded should use brotli codec");
2012 }
2013 }
2014
2015 #[test]
2016 fn test_embedded_metadata_ndjson_roundtrip() {
2017 let data = std::fs::read(concat!(
2020 env!("CARGO_MANIFEST_DIR"),
2021 "/../../corpus/test-ndjson.ndjson"
2022 ))
2023 .unwrap();
2024
2025 let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
2026 let decompressed = decompress_from_slice(&compressed).unwrap();
2027 assert_eq!(
2028 decompressed, data,
2029 "NDJSON embedded metadata roundtrip: byte-exact mismatch"
2030 );
2031 }
2032
2033 #[test]
2034 fn test_embedded_metadata_manual_roundtrip() {
2035 let original = b"Hello, embedded metadata world! This is a test.";
2038 let crc = crc32fast::hash(original);
2039
2040 let empty_chain = TransformChain::new();
2043 let raw_metadata = empty_chain.serialize();
2044
2045 let mut embedded = Vec::new();
2047 embedded.extend_from_slice(&(raw_metadata.len() as u32).to_le_bytes());
2048 embedded.extend_from_slice(&raw_metadata);
2049 embedded.extend_from_slice(original);
2050
2051 let brotli_data = brotli_compress(&embedded, 11, BROTLI_MODE_GENERIC).unwrap();
2052
2053 let header = crate::dcx::DcxHeader {
2054 mode: Mode::Fast,
2055 format_hint: crate::dcx::FormatHint::Generic,
2056 original_size: original.len() as u64,
2057 compressed_size: brotli_data.len() as u64,
2058 crc32: crc,
            transform_metadata: vec![],
            has_dict: false,
2061 meta_compressed: false,
2062 use_brotli: true,
2063 meta_embedded: true,
2064 };
2065
2066 let mut buf = Vec::new();
2067 header.write_to(&mut buf).unwrap();
2068 buf.extend_from_slice(&brotli_data);
2069
2070 assert_ne!(buf[7] & crate::dcx::FLAG_META_EMBEDDED, 0);
2072 assert_ne!(buf[7] & crate::dcx::FLAG_BROTLI, 0);
2073
2074 let decompressed = decompress_from_slice(&buf).unwrap();
2076 assert_eq!(decompressed, original.to_vec());
2077 }
2078
2079 #[test]
2082 fn test_brotli_text_mode_on_raw() {
2083 let data = br#"{"name":"Alice","age":30,"city":"New York","active":true}"#;
2085
2086 let compressed_text = brotli_compress(data, 11, BROTLI_MODE_TEXT).unwrap();
2088 let decompressed_text = brotli_decompress(&compressed_text).unwrap();
2089 assert_eq!(
2090 decompressed_text,
2091 data.to_vec(),
2092 "TEXT mode roundtrip failed"
2093 );
2094
2095 let compressed_generic = brotli_compress(data, 11, BROTLI_MODE_GENERIC).unwrap();
2097 let decompressed_generic = brotli_decompress(&compressed_generic).unwrap();
2098 assert_eq!(
2099 decompressed_generic,
2100 data.to_vec(),
2101 "GENERIC mode roundtrip failed"
2102 );
2103
2104 assert!(
2109 !compressed_text.is_empty(),
2110 "TEXT mode should produce non-empty output"
2111 );
2112 }
2113
2114 #[test]
2117 fn test_zstd_embedded_metadata_roundtrip() {
2118 let original = b"Hello, zstd embedded metadata! This is a test of the zstd path.";
2121 let crc = crc32fast::hash(original);
2122
2123 let empty_chain = TransformChain::new();
2125 let raw_metadata = empty_chain.serialize();
2126
2127 let mut embedded = Vec::new();
2129 embedded.extend_from_slice(&(raw_metadata.len() as u32).to_le_bytes());
2130 embedded.extend_from_slice(&raw_metadata);
2131 embedded.extend_from_slice(original);
2132
2133 let zstd_data = zstd::bulk::compress(&embedded, 19).unwrap();
2134
2135 let header = crate::dcx::DcxHeader {
2136 mode: Mode::Fast,
2137 format_hint: crate::dcx::FormatHint::Generic,
2138 original_size: original.len() as u64,
2139 compressed_size: zstd_data.len() as u64,
2140 crc32: crc,
            transform_metadata: vec![],
            has_dict: false,
2143 meta_compressed: false,
            use_brotli: false,
            meta_embedded: true,
2146 };
2147
2148 let mut buf = Vec::new();
2149 header.write_to(&mut buf).unwrap();
2150 buf.extend_from_slice(&zstd_data);
2151
2152 assert_ne!(buf[7] & crate::dcx::FLAG_META_EMBEDDED, 0);
2154 assert_eq!(buf[7] & crate::dcx::FLAG_BROTLI, 0);
2155
2156 let decompressed = decompress_from_slice(&buf).unwrap();
2158 assert_eq!(decompressed, original.to_vec());
2159 }
2160
2161 #[test]
2164 fn test_multi_quality_brotli() {
2165 let data = br#"{"items":[1,2,3,4,5],"nested":{"a":"hello","b":"world"}}"#;
2168
2169 let q10 = brotli_compress(data, 10, BROTLI_MODE_GENERIC).unwrap();
2170 let q11 = brotli_compress(data, 11, BROTLI_MODE_GENERIC).unwrap();
2171
2172 let dec_q10 = brotli_decompress(&q10).unwrap();
2173 let dec_q11 = brotli_decompress(&q11).unwrap();
2174
2175 assert_eq!(dec_q10, data.to_vec(), "quality 10 roundtrip failed");
2176 assert_eq!(dec_q11, data.to_vec(), "quality 11 roundtrip failed");
2177
2178 assert!(!q10.is_empty());
2180 assert!(!q11.is_empty());
2181
2182 let corpus_files = [
2186 concat!(env!("CARGO_MANIFEST_DIR"), "/../../corpus/test-api.json"),
2187 concat!(
2188 env!("CARGO_MANIFEST_DIR"),
2189 "/../../corpus/json-bench/twitter.json"
2190 ),
2191 ];
2192 for path in corpus_files {
2193 let file_data = std::fs::read(path).unwrap();
2194 let compressed =
2195 compress_to_vec(&file_data, Mode::Fast, Some(FormatHint::Json)).unwrap();
2196 let decompressed = decompress_from_slice(&compressed).unwrap();
2197 assert_eq!(
2198 decompressed, file_data,
2199 "multi-quality roundtrip failed for {path}"
2200 );
2201 }
2202 }
2203}