// datacortex_core/codec.rs
1//! Codec orchestrator — compress and decompress through the DataCortex pipeline.
2//!
3//! Phase 1: Format preprocessing + zstd (Fast mode).
4//! Phase 3: Full CM engine with higher-order models + mixer + APM (Balanced mode, ~256MB).
5//! Phase 5: Full CM engine with 2x context maps (Max mode, ~512MB).
6//! Phase 6: Dual-path CM + LLM with MetaMixer (Max mode with `neural` feature).
7//! Phase 7: Dual-path CM + GRU byte-level predictor (Balanced mode).
8
9use std::io::{self, Cursor, Read, Write};
10
11use crate::dcx::{DcxHeader, FormatHint, Mode};
12use crate::entropy::arithmetic::{ArithmeticDecoder, ArithmeticEncoder};
13use crate::format::transform::TransformChain;
14use crate::format::{detect_format, preprocess, reverse_preprocess};
15use crate::mixer::MetaMixer;
16use crate::model::gru_model::GruModel;
17use crate::model::{CMConfig, CMEngine};
18
/// Pick the zstd level for Fast mode based on preprocessed data size.
///
/// Small inputs compress quickly even at high levels, so higher zstd
/// levels are used for small/medium files without a meaningful speed hit.
/// A user-supplied `level_override` (--level) always takes precedence.
fn adaptive_fast_level(data_size: usize, level_override: Option<i32>) -> i32 {
    // An explicit user level always wins over the size heuristic.
    if let Some(level) = level_override {
        return level;
    }
    if data_size <= 1_048_576 {
        19 // <= 1 MB: zstd-19 is fast enough, take the best ratio
    } else if data_size <= 10_485_760 {
        13 // 1-10 MB: balance of ratio and speed
    } else {
        9 // > 10 MB: favour speed
    }
}
34
// ─── Zstd Dictionary Training (Fast mode) ─────────────────────────────────────

/// Minimum preprocessed data size (bytes) to attempt dictionary training.
/// Below this threshold the dictionary overhead exceeds any savings.
const DICT_MIN_DATA_SIZE: usize = 8192;
40
/// Target chunk size for splitting preprocessed data before per-chunk compression.
/// Each chunk is compressed independently with the shared dictionary.
/// Smaller chunks benefit more from dictionary priming, but each chunk carries
/// framing overhead (4-byte size prefix + ~10-byte zstd frame header), so the
/// chunk size scales with the input to keep the chunk count reasonable.
fn dict_chunk_size(data_len: usize) -> usize {
    match data_len {
        len if len > 4_194_304 => 131_072, // > 4 MB      -> 128 KB chunks
        len if len > 1_048_576 => 65_536,  // 1 - 4 MB    -> 64 KB chunks
        len if len > 262_144 => 32_768,    // 256KB - 1MB -> 32 KB chunks
        _ => 16_384,                       // smaller     -> 16 KB chunks
    }
}
57
/// Maximum dictionary size based on input data size.
/// Kept deliberately small to minimize payload overhead: the dictionary only
/// primes each chunk's compressor context, and a small dict already captures
/// most of that benefit.
fn dict_max_size(data_len: usize) -> usize {
    match data_len {
        len if len > 4_194_304 => 16_384, // > 4 MB  -> 16 KB dict
        len if len > 1_048_576 => 8_192,  // 1 - 4 MB -> 8 KB dict
        _ => 4_096,                       // smaller -> 4 KB dict
    }
}
70
71/// Generate training samples from the data for dictionary training.
72///
73/// Uses column boundaries (0x00 separators) if available, otherwise fixed blocks.
74/// These samples are only used for `zstd::dict::from_samples`, NOT for the
75/// actual chunked compression (which uses `split_into_chunks`).
76fn generate_training_samples(data: &[u8], chunk_size: usize) -> Vec<&[u8]> {
77    // Try column boundaries (0x00 separators from columnar transform).
78    let col_chunks: Vec<&[u8]> = data.split(|&b| b == 0x00).collect();
79    if col_chunks.len() >= 5 {
80        // Filter out empty chunks and return.
81        return col_chunks.into_iter().filter(|c| !c.is_empty()).collect();
82    }
83
84    // Fall back to fixed-size blocks for training.
85    split_into_chunks(data, chunk_size)
86}
87
/// Split data into fixed-size chunks for per-chunk compression.
///
/// Every byte is preserved exactly — no bytes are lost at boundaries; the
/// final chunk may be shorter than `chunk_size`.
///
/// Uses `slice::chunks`, which panics on `chunk_size == 0` (the previous
/// open-coded loop would spin forever on zero); all callers pass >= 16 KB.
fn split_into_chunks(data: &[u8], chunk_size: usize) -> Vec<&[u8]> {
    data.chunks(chunk_size).collect()
}
100
101/// Attempt chunk-based dictionary compression.
102///
103/// 1. Split data into chunks
104/// 2. Train a zstd dictionary on the chunks
105/// 3. Compress each chunk independently using the trained dictionary
106/// 4. Return the dict + all compressed chunks as a payload
107///
108/// Returns `Some(payload)` if the total is smaller than `plain_size`, else `None`.
109fn try_dict_compress(data: &[u8], level: i32, plain_size: usize) -> Option<Vec<u8>> {
110    let chunk_size = dict_chunk_size(data.len());
111
112    // Generate training samples (may use column boundaries for better diversity).
113    let training_samples = generate_training_samples(data, chunk_size);
114    if training_samples.len() < 5 {
115        return None;
116    }
117
118    let max_dict = dict_max_size(data.len());
119
120    // Train dictionary from the training samples.
121    let dict = zstd::dict::from_samples(&training_samples, max_dict).ok()?;
122    if dict.is_empty() {
123        return None;
124    }
125
126    // Split data into fixed-size chunks for per-chunk compression.
127    let chunks = split_into_chunks(data, chunk_size);
128
129    // Compress each chunk independently with the dictionary.
130    let mut compressor = zstd::bulk::Compressor::with_dictionary(level, &dict).ok()?;
131    let mut compressed_chunks: Vec<Vec<u8>> = Vec::with_capacity(chunks.len());
132    for chunk in &chunks {
133        let cc = compressor.compress(chunk).ok()?;
134        compressed_chunks.push(cc);
135    }
136
137    // Build payload:
138    //   [dict_size: u32 LE] [dict_bytes]
139    //   [num_chunks: u32 LE]
140    //   for each chunk: [chunk_compressed_size: u32 LE] [chunk_data]
141    let total_compressed: usize = compressed_chunks.iter().map(|c| 4 + c.len()).sum();
142    let payload_size = 4 + dict.len() + 4 + total_compressed;
143
144    // Only use dict if it beats plain compression.
145    if payload_size >= plain_size {
146        return None;
147    }
148
149    let mut payload = Vec::with_capacity(payload_size);
150    payload.extend_from_slice(&(dict.len() as u32).to_le_bytes());
151    payload.extend_from_slice(&dict);
152    payload.extend_from_slice(&(compressed_chunks.len() as u32).to_le_bytes());
153    for cc in &compressed_chunks {
154        payload.extend_from_slice(&(cc.len() as u32).to_le_bytes());
155        payload.extend_from_slice(cc);
156    }
157
158    Some(payload)
159}
160
/// Decompress a chunk-based dictionary-compressed payload.
///
/// Payload format:
///   [dict_size: u32 LE] [dict_bytes]
///   [num_chunks: u32 LE]
///   for each chunk: [chunk_compressed_size: u32 LE] [chunk_data]
///
/// Chunks are decompressed individually and concatenated. `capacity` is the
/// expected total decompressed size; it bounds each per-chunk decompression.
/// Every length field is bounds-checked before slicing, so a truncated or
/// corrupt payload yields an `InvalidData` error rather than a panic.
fn decompress_with_dict(payload: &[u8], capacity: usize) -> std::io::Result<Vec<u8>> {
    if payload.len() < 4 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "dict payload too short for dict_size",
        ));
    }
    let mut pos = 0;

    // Read dictionary.
    // expect() cannot fire: the slice is exactly 4 bytes by construction.
    let dict_size =
        u32::from_le_bytes(payload[pos..pos + 4].try_into().expect("4-byte slice")) as usize;
    pos += 4;
    if payload.len() < pos + dict_size {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "dict payload truncated: dictionary bytes",
        ));
    }
    let dict_bytes = &payload[pos..pos + dict_size];
    pos += dict_size;

    // Read number of chunks.
    if payload.len() < pos + 4 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "dict payload truncated: num_chunks",
        ));
    }
    let num_chunks =
        u32::from_le_bytes(payload[pos..pos + 4].try_into().expect("4-byte slice")) as usize;
    pos += 4;

    // Prepare decompressor with dictionary.
    let mut decompressor = zstd::bulk::Decompressor::with_dictionary(dict_bytes)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

    let mut output = Vec::with_capacity(capacity);

    for i in 0..num_chunks {
        if payload.len() < pos + 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("dict payload truncated: chunk {i} size"),
            ));
        }
        let chunk_size =
            u32::from_le_bytes(payload[pos..pos + 4].try_into().expect("4-byte slice")) as usize;
        pos += 4;
        if payload.len() < pos + chunk_size {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!("dict payload truncated: chunk {i} data"),
            ));
        }
        let chunk_data = &payload[pos..pos + chunk_size];
        pos += chunk_size;

        // Cap each chunk's output at the remaining expected capacity; for a
        // well-formed payload the chunks sum to at most `capacity`.
        let chunk_capacity = capacity.saturating_sub(output.len());
        let decompressed = decompressor
            .decompress(chunk_data, chunk_capacity)
            .map_err(|e| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("chunk {i} decompress failed: {e}"),
                )
            })?;
        output.extend_from_slice(&decompressed);
    }

    Ok(output)
}
242
// ─── Brotli helpers (Fast mode auto-fallback) ─────────────────────────────────

/// Brotli mode constants for `brotli_compress` (mapped to `BrotliEncoderMode`).
/// GENERIC (0): default, best for binary/preprocessed data.
/// TEXT (1): optimized for UTF-8 text, better for raw JSON.
const BROTLI_MODE_GENERIC: u32 = 0;
const BROTLI_MODE_TEXT: u32 = 1;
250
251/// Compress `data` with brotli at the given quality (0-11) and mode.
252/// Use `BROTLI_MODE_TEXT` for raw UTF-8/JSON, `BROTLI_MODE_GENERIC` for preprocessed data.
253fn brotli_compress(data: &[u8], quality: u32, mode: u32) -> io::Result<Vec<u8>> {
254    use brotli::enc::backward_references::BrotliEncoderMode;
255    let mut output = Vec::new();
256    let brotli_mode = match mode {
257        1 => BrotliEncoderMode::BROTLI_MODE_TEXT,
258        _ => BrotliEncoderMode::BROTLI_MODE_GENERIC,
259    };
260    let params = brotli::enc::BrotliEncoderParams {
261        quality: quality as i32,
262        mode: brotli_mode,
263        ..Default::default()
264    };
265    brotli::BrotliCompress(&mut io::Cursor::new(data), &mut output, &params)?;
266    Ok(output)
267}
268
/// Decompress a brotli stream produced by `brotli_compress`.
/// The output buffer grows as needed; there is no size hint parameter.
fn brotli_decompress(data: &[u8]) -> io::Result<Vec<u8>> {
    let mut output = Vec::new();
    brotli::BrotliDecompress(&mut io::Cursor::new(data), &mut output)?;
    Ok(output)
}
275
276/// Compress data using the CM engine with the given configuration.
277/// Returns the compressed byte stream.
278fn cm_compress(data: &[u8], config: CMConfig) -> Vec<u8> {
279    let mut engine = CMEngine::with_config(config);
280    let mut encoder = ArithmeticEncoder::new();
281
282    for &byte in data {
283        for bpos in 0..8 {
284            let bit = (byte >> (7 - bpos)) & 1;
285            let p = engine.predict();
286            encoder.encode(bit, p);
287            engine.update(bit);
288        }
289    }
290
291    encoder.finish()
292}
293
294/// Decompress data using the CM engine with the given configuration.
295/// `compressed` is the arithmetic-coded stream, `original_size` is the expected output length.
296fn cm_decompress(compressed: &[u8], original_size: usize, config: CMConfig) -> Vec<u8> {
297    let mut engine = CMEngine::with_config(config);
298    let mut decoder = ArithmeticDecoder::new(compressed);
299    let mut output = Vec::with_capacity(original_size);
300
301    for _ in 0..original_size {
302        let mut byte_val: u8 = 0;
303        for bpos in 0..8 {
304            let p = engine.predict();
305            let bit = decoder.decode(p);
306            engine.update(bit);
307            byte_val |= bit << (7 - bpos);
308        }
309        output.push(byte_val);
310    }
311
312    output
313}
314
// ─── GRU dual-path (CM + GRU byte predictor) ────────────────────────────────
// The GRU provides a DIFFERENT signal from CM: byte-level cross-bit correlations.
// It's blended AFTER the full CM pipeline via MetaMixer.
// CRITICAL: encoder and decoder must produce IDENTICAL GRU + CM state.

/// Compress using dual-path: CM engine + GRU byte predictor + MetaMixer.
/// Used for Balanced mode.
///
/// For every bit (MSB-first) the CM engine and the GRU each produce a
/// probability; the MetaMixer blends them and the arithmetic coder encodes
/// with the blend. After each complete byte the GRU is trained on that byte
/// and stepped forward so its cached probabilities predict the NEXT byte.
/// `gru_decompress` must replay these steps in exactly the same order.
fn gru_compress(data: &[u8], config: CMConfig) -> Vec<u8> {
    let mut engine = CMEngine::with_config(config);
    let mut gru = GruModel::new();
    let mut meta_mixer = MetaMixer::new(12); // 12% GRU weight
    let mut encoder = ArithmeticEncoder::new();

    // Progress is reported every 5% for inputs over 100 KB; 0 disables it.
    let total_bytes = data.len();
    let report_interval = if total_bytes > 100_000 {
        total_bytes / 20
    } else {
        0
    };

    for (byte_idx, &byte) in data.iter().enumerate() {
        for bpos in 0..8u8 {
            let bit = (byte >> (7 - bpos)) & 1;

            // CM prediction (full pipeline: 19 models + mixer + 7 APM).
            let p_cm = engine.predict();

            // GRU bit prediction from cached byte probs.
            // `partial` is a leading 1 sentinel followed by the bits of this
            // byte already encoded (MSB-first).
            let partial = if bpos == 0 {
                1u32
            } else {
                let mut p = 1u32;
                for prev_bpos in 0..bpos {
                    let prev_bit = (byte >> (7 - prev_bpos)) & 1;
                    p = (p << 1) | prev_bit as u32;
                }
                p
            };
            let p_gru = gru.predict_bit(bpos, partial);

            // MetaMixer blend.
            let p_final = meta_mixer.blend(p_cm, p_gru);

            encoder.encode(bit, p_final);
            engine.update(bit);
            meta_mixer.update(bit);
        }

        // Byte complete: train GRU on observed byte, then forward for next prediction.
        gru.train(byte);
        gru.forward(byte);

        if report_interval > 0 && (byte_idx + 1) % report_interval == 0 {
            let pct = (byte_idx + 1) * 100 / total_bytes;
            eprint!("\r[gru] compressing... {pct}%");
        }
    }

    // Same threshold as report_interval above, so the final line only prints
    // when intermediate progress was printed.
    if total_bytes > 100_000 {
        eprintln!("\r[gru] compressing... 100%");
    }

    encoder.finish()
}
379
/// Decompress using dual-path: CM engine + GRU byte predictor + MetaMixer.
/// Must produce IDENTICAL GRU + CM state as the encoder.
///
/// Mirrors `gru_compress` step-for-step: same MetaMixer weight (12), same
/// partial-byte construction, same train-then-forward order after each byte.
/// Any divergence here corrupts the arithmetic decode from that point on.
fn gru_decompress(compressed: &[u8], original_size: usize, config: CMConfig) -> Vec<u8> {
    let mut engine = CMEngine::with_config(config);
    let mut gru = GruModel::new();
    let mut meta_mixer = MetaMixer::new(12); // same 12% as encoder
    let mut decoder = ArithmeticDecoder::new(compressed);
    let mut output = Vec::with_capacity(original_size);

    // Progress is reported every 5% for outputs over 100 KB; 0 disables it.
    let report_interval = if original_size > 100_000 {
        original_size / 20
    } else {
        0
    };

    for byte_idx in 0..original_size {
        let mut byte_val: u8 = 0;

        for bpos in 0..8u8 {
            // CM prediction.
            let p_cm = engine.predict();

            // GRU bit prediction (same partial byte state as encoder):
            // leading 1 sentinel followed by the bits already decoded.
            let partial = if bpos == 0 {
                1u32
            } else {
                let mut p = 1u32;
                for prev_bpos in 0..bpos {
                    let prev_bit = (byte_val >> (7 - prev_bpos)) & 1;
                    p = (p << 1) | prev_bit as u32;
                }
                p
            };
            let p_gru = gru.predict_bit(bpos, partial);

            // MetaMixer blend.
            let p_final = meta_mixer.blend(p_cm, p_gru);

            let bit = decoder.decode(p_final);
            engine.update(bit);
            meta_mixer.update(bit);
            byte_val |= bit << (7 - bpos);
        }

        output.push(byte_val);

        // Byte complete: train GRU then forward (same as encoder).
        gru.train(byte_val);
        gru.forward(byte_val);

        if report_interval > 0 && (byte_idx + 1) % report_interval == 0 {
            let pct = (byte_idx + 1) * 100 / original_size;
            eprint!("\r[gru] decompressing... {pct}%");
        }
    }

    if original_size > 100_000 {
        eprintln!("\r[gru] decompressing... 100%");
    }

    output
}
442
// ─── Neural dual-path (CM + LLM) ─────────────────────────────────────────────
// Feature-gated: only available when `neural` is enabled.
// The LLM predictor runs alongside the CM engine. A MetaMixer blends them.
// CRITICAL: encoder and decoder must produce IDENTICAL LLM + CM state.

/// Compress using dual-path: CM engine + LLM predictor + MetaMixer.
/// Only used for Max mode with neural feature enabled.
///
/// `llm` and `meta_mixer` are caller-supplied and mutated throughout.
/// NOTE(review): both appear to be expected in a fresh state that
/// `neural_decompress` can reproduce — confirm against call sites.
#[cfg(feature = "neural")]
fn neural_compress(
    data: &[u8],
    config: CMConfig,
    llm: &mut datacortex_neural::LlmPredictor,
    meta_mixer: &mut datacortex_neural::MetaMixer,
) -> Vec<u8> {
    let mut engine = CMEngine::with_config(config);
    let mut encoder = ArithmeticEncoder::new();

    // For the first byte, LLM has no context. Feed a zero byte to prime it.
    // We need the LLM to have predicted byte probs BEFORE we start encoding.
    // Strategy: process byte-by-byte. After encoding byte N, feed byte N to LLM
    // to get predictions for byte N+1.

    let total_bytes = data.len();
    let mut bytes_processed = 0;
    let report_interval = total_bytes / 20; // Report every 5% (0 for inputs < 20 bytes disables it).

    for (byte_idx, &byte) in data.iter().enumerate() {
        // At this point, LLM has been fed bytes 0..byte_idx-1.
        // LLM's cached_byte_probs predict byte_idx.

        for bpos in 0..8u8 {
            let bit = (byte >> (7 - bpos)) & 1;

            // CM prediction.
            let p_cm = engine.predict();

            // LLM bit prediction.
            // c0 is the partial byte being built: starts at 1, accumulates bits.
            let partial = if bpos == 0 {
                1u32
            } else {
                // Build partial from the bits we've already encoded for this byte.
                let mut p = 1u32;
                for prev_bpos in 0..bpos {
                    let prev_bit = (byte >> (7 - prev_bpos)) & 1;
                    p = (p << 1) | prev_bit as u32;
                }
                p
            };
            let p_llm = llm.predict_bit(bpos, partial);

            // Meta-mixer blend.
            let p_final = meta_mixer.blend(p_cm, p_llm);

            encoder.encode(bit, p_final);
            engine.update(bit);
            meta_mixer.update(bit);
        }

        // Feed the completed byte to the LLM for next-byte prediction.
        if let Err(e) = llm.predict_byte_probs(byte) {
            // If LLM fails, it will return uniform on next call. Log but don't abort.
            // Only the first few errors are printed to avoid log spam.
            if byte_idx < 5 {
                eprintln!("[neural] LLM predict error at byte {byte_idx}: {e}");
            }
        }

        bytes_processed += 1;
        if report_interval > 0 && bytes_processed % report_interval == 0 {
            let pct = bytes_processed * 100 / total_bytes;
            eprint!("\r[neural] compressing... {pct}%");
        }
    }

    if total_bytes > 1000 {
        eprintln!("\r[neural] compressing... 100%");
    }

    encoder.finish()
}
523
/// Decompress using dual-path: CM engine + LLM predictor + MetaMixer.
/// Must produce IDENTICAL LLM + CM state as the encoder.
///
/// Mirrors `neural_compress` exactly: same partial-byte construction, same
/// order of predict/decode/update calls, and each decoded byte is fed to the
/// LLM just as the encoder fed the original byte.
#[cfg(feature = "neural")]
fn neural_decompress(
    compressed: &[u8],
    original_size: usize,
    config: CMConfig,
    llm: &mut datacortex_neural::LlmPredictor,
    meta_mixer: &mut datacortex_neural::MetaMixer,
) -> Vec<u8> {
    let mut engine = CMEngine::with_config(config);
    let mut decoder = ArithmeticDecoder::new(compressed);
    let mut output = Vec::with_capacity(original_size);

    // Report every 5%. The else-branch value (1) is never used: the loop body
    // does not execute when original_size == 0.
    let report_interval = if original_size > 0 {
        original_size / 20
    } else {
        1
    };

    for byte_idx in 0..original_size {
        let mut byte_val: u8 = 0;

        for bpos in 0..8u8 {
            // CM prediction.
            let p_cm = engine.predict();

            // LLM bit prediction (using same partial byte state as encoder).
            let partial = if bpos == 0 {
                1u32
            } else {
                // Build partial from bits already decoded for this byte.
                let mut p = 1u32;
                for prev_bpos in 0..bpos {
                    let prev_bit = (byte_val >> (7 - prev_bpos)) & 1;
                    p = (p << 1) | prev_bit as u32;
                }
                p
            };
            let p_llm = llm.predict_bit(bpos, partial);

            // Meta-mixer blend.
            let p_final = meta_mixer.blend(p_cm, p_llm);

            let bit = decoder.decode(p_final);
            engine.update(bit);
            meta_mixer.update(bit);
            byte_val |= bit << (7 - bpos);
        }

        output.push(byte_val);

        // Feed decoded byte to LLM (same as encoder did).
        if let Err(e) = llm.predict_byte_probs(byte_val) {
            if byte_idx < 5 {
                eprintln!("[neural] LLM predict error at byte {byte_idx}: {e}");
            }
        }

        if report_interval > 0 && (byte_idx + 1) % report_interval == 0 {
            let pct = (byte_idx + 1) * 100 / original_size;
            eprint!("\r[neural] decompressing... {pct}%");
        }
    }

    if original_size > 1000 {
        eprintln!("\r[neural] decompressing... 100%");
    }

    output
}
595
596/// Get the CMConfig for a given mode.
597fn cm_config_for_mode(mode: Mode) -> CMConfig {
598    match mode {
599        Mode::Max => CMConfig::max(),
600        Mode::Balanced => CMConfig::balanced(),
601        Mode::Fast => CMConfig::balanced(), // not used for Fast, but keeps API clean
602    }
603}
604
/// Resolve the model path from:
/// 1. Explicit path (--model-path CLI flag)
/// 2. DATACORTEX_MODEL environment variable
/// 3. Default: ~/.datacortex/models/SmolLM2-135M-Instruct-Q8_0.gguf
#[cfg(feature = "neural")]
fn resolve_model_path(explicit: Option<&str>) -> Option<String> {
    use std::path::Path;

    // 1. Explicit CLI path: use it or fail loudly — never fall back silently.
    if let Some(p) = explicit {
        if Path::new(p).exists() {
            return Some(p.to_string());
        }
        eprintln!("[neural] explicit model path not found: {p}");
        return None;
    }

    // 2. Environment variable. An empty value is an explicit "disable neural";
    //    a set-but-missing path does NOT fall through to the default.
    if let Ok(p) = std::env::var("DATACORTEX_MODEL") {
        if p.is_empty() {
            return None;
        }
        if Path::new(&p).exists() {
            return Some(p);
        }
        eprintln!("[neural] DATACORTEX_MODEL path not found: {p}");
        return None;
    }

    // 3. Default location under the user's home directory.
    let home = std::env::var_os("HOME")?;
    let default = format!(
        "{}/.datacortex/models/SmolLM2-135M-Instruct-Q8_0.gguf",
        home.to_string_lossy()
    );
    Path::new(&default).exists().then_some(default)
}
644
/// Compress `data` into .dcx format, writing to `output`.
///
/// Convenience wrapper: delegates to `compress_with_model` with no explicit
/// model path.
pub fn compress<W: Write>(
    data: &[u8],
    mode: Mode,
    format_override: Option<FormatHint>,
    output: &mut W,
) -> io::Result<()> {
    compress_with_model(data, mode, format_override, None, output)
}
654
/// Compress with optional explicit model path (for neural Max mode).
///
/// Delegates to `compress_with_options` with no zstd level override, so Fast
/// mode uses the adaptive level heuristic.
pub fn compress_with_model<W: Write>(
    data: &[u8],
    mode: Mode,
    format_override: Option<FormatHint>,
    model_path: Option<&str>,
    output: &mut W,
) -> io::Result<()> {
    compress_with_options(data, mode, format_override, model_path, None, output)
}
665
666/// Compress with optional explicit model path and zstd level override.
667pub fn compress_with_options<W: Write>(
668    data: &[u8],
669    mode: Mode,
670    format_override: Option<FormatHint>,
671    model_path: Option<&str>,
672    zstd_level_override: Option<i32>,
673    output: &mut W,
674) -> io::Result<()> {
675    let format_hint = format_override.unwrap_or_else(|| detect_format(data));
676    let crc = crc32fast::hash(data);
677
678    // Step 1: Format-aware preprocessing.
679    let (preprocessed, chain) = preprocess(data, format_hint, mode);
680    let transform_metadata = if chain.is_empty() {
681        vec![]
682    } else {
683        chain.serialize()
684    };
685
686    // Step 2: Compress with engine.
687    let mut use_dict = false;
688    let mut use_brotli = false;
689    // Track whether raw fallback won (empty transform chain).
690    let mut use_raw_fallback = false;
691    // Track whether metadata is embedded in the compressed stream.
692    let mut use_meta_embedded = false;
693    let compressed = match mode {
694        // Fast mode: auto-fallback — try preprocessed+zstd, raw+zstd, raw+brotli,
695        // preprocessed+brotli, and embedded-metadata+brotli. Keep whichever produces
696        // the smallest output (including header and metadata overhead).
697        //
698        // Preprocessing (columnar + typed encoding) usually helps zstd by grouping
699        // similar values. But for some files (e.g. citm_catalog.json with extreme
700        // repetition), raw zstd without preprocessing gives MUCH better results
701        // because preprocessing removes the repetition patterns zstd's LZ77 exploits.
702        //
703        // Brotli at quality 11 can beat zstd on some JSON files (e.g. twitter.json)
704        // because its context modeling handles certain data patterns better.
705        //
706        // For small files with transforms, embedding metadata inside the brotli stream
707        // saves the separate metadata overhead (~150 bytes), because brotli compresses
708        // the 4-byte length prefix + raw metadata nearly for free.
709        Mode::Fast => {
710            let level = adaptive_fast_level(preprocessed.len(), zstd_level_override);
711
712            // Path A: preprocessed + zstd (with optional dict).
713            let plain_a = zstd::bulk::compress(&preprocessed, level).map_err(io::Error::other)?;
714
715            let (compressed_a, dict_a) = if preprocessed.len() >= DICT_MIN_DATA_SIZE {
716                if let Some(dict_payload) = try_dict_compress(&preprocessed, level, plain_a.len()) {
717                    (dict_payload, true)
718                } else {
719                    (plain_a, false)
720                }
721            } else {
722                (plain_a, false)
723            };
724
725            // Estimate compressed metadata size for fair comparison.
726            // This matches the compression that happens later for the header.
727            let meta_size_for_comparison = if transform_metadata.len() > 64 {
728                let compressed_meta = zstd::bulk::compress(&transform_metadata, 19)
729                    .unwrap_or_else(|_| transform_metadata.clone());
730                if compressed_meta.len() < transform_metadata.len() {
731                    compressed_meta.len()
732                } else {
733                    transform_metadata.len()
734                }
735            } else {
736                transform_metadata.len()
737            };
738
739            // Total size for Path A: header(32) + (compressed) metadata + compressed_a.
740            let total_a = 32 + meta_size_for_comparison + compressed_a.len();
741
742            // Path B: raw zstd (no preprocessing, no dict).
743            // Use same adaptive level but on original data size.
744            let raw_level = adaptive_fast_level(data.len(), zstd_level_override);
745            let compressed_b = zstd::bulk::compress(data, raw_level).map_err(io::Error::other)?;
746
747            // Total size for Path B: header(32) + 0 (empty metadata) + compressed_b.
748            let total_b = 32 + compressed_b.len();
749
750            // Start with best of zstd paths.
751            let (
752                mut best_compressed,
753                mut best_total,
754                mut best_dict,
755                mut best_raw,
756                mut best_brotli,
757                mut best_embedded,
758            ) = if total_b < total_a {
759                (compressed_b, total_b, false, true, false, false)
760            } else {
761                (compressed_a, total_a, dict_a, false, false, false)
762            };
763
764            // Path C: raw + brotli (TEXT mode — raw JSON is UTF-8 text).
765            // Use quality 11 for files <= 1MB, 9 for larger (speed tradeoff).
766            let brotli_quality = if data.len() <= 1_048_576 { 11 } else { 9 };
767            if let Ok(brotli_raw) = brotli_compress(data, brotli_quality, BROTLI_MODE_TEXT) {
768                let brotli_raw_total = 32 + brotli_raw.len();
769                if brotli_raw_total < best_total {
770                    best_compressed = brotli_raw;
771                    best_total = brotli_raw_total;
772                    best_dict = false;
773                    best_raw = true;
774                    best_brotli = true;
775                    best_embedded = false;
776                }
777            }
778
779            // Path D: preprocessed + brotli (separate metadata, GENERIC mode).
780            // Try both quality 11 and 10 — sometimes q10 is smaller on binary-mixed data.
781            {
782                let brotli_prep_max_q = if preprocessed.len() <= 1_048_576 {
783                    11
784                } else {
785                    9
786                };
787                let qualities = if brotli_prep_max_q == 11 {
788                    &[11u32, 10][..]
789                } else {
790                    &[brotli_prep_max_q as u32][..]
791                };
792                for &q in qualities {
793                    if let Ok(brotli_prep) = brotli_compress(&preprocessed, q, BROTLI_MODE_GENERIC)
794                    {
795                        let brotli_prep_total = 32 + meta_size_for_comparison + brotli_prep.len();
796                        if brotli_prep_total < best_total {
797                            best_compressed = brotli_prep;
798                            best_total = brotli_prep_total;
799                            best_dict = false;
800                            best_raw = false;
801                            best_brotli = true;
802                            best_embedded = false;
803                        }
804                    }
805                }
806            }
807
808            // Build the embedded metadata payload once for Paths E and F.
809            let embedded_payload = if !transform_metadata.is_empty() {
810                let mut ep = Vec::with_capacity(4 + transform_metadata.len() + preprocessed.len());
811                ep.extend_from_slice(&(transform_metadata.len() as u32).to_le_bytes());
812                ep.extend_from_slice(&transform_metadata);
813                ep.extend_from_slice(&preprocessed);
814                Some(ep)
815            } else {
816                None
817            };
818
819            // Path E: preprocessed + brotli with EMBEDDED metadata (GENERIC mode).
820            // Only attempt when transforms were applied (non-empty metadata).
821            // The metadata is prepended to the preprocessed data as:
822            //   [meta_len: u32 LE][raw_metadata][preprocessed_data]
823            // and the whole thing is brotli-compressed together. This eliminates
824            // separate metadata overhead in the header.
825            // Try both quality 11 and 10 (same dual-quality as Path D).
826            if let Some(ref embedded_payload) = embedded_payload {
827                let embed_max_q = if embedded_payload.len() <= 1_048_576 {
828                    11
829                } else {
830                    9
831                };
832                let qualities = if embed_max_q == 11 {
833                    &[11u32, 10][..]
834                } else {
835                    &[embed_max_q as u32][..]
836                };
837                for &q in qualities {
838                    if let Ok(brotli_embedded) =
839                        brotli_compress(embedded_payload, q, BROTLI_MODE_GENERIC)
840                    {
841                        // Total: header(32) + 0 (no separate metadata) + brotli_embedded.
842                        let brotli_embedded_total = 32 + brotli_embedded.len();
843                        if brotli_embedded_total < best_total {
844                            best_compressed = brotli_embedded;
845                            best_total = brotli_embedded_total;
846                            best_dict = false;
847                            best_raw = false;
848                            best_brotli = true;
849                            best_embedded = true;
850                        }
851                    }
852                }
853            }
854
855            // Path F: preprocessed + zstd with EMBEDDED metadata (no dict).
856            // When transforms were applied and dict was NOT the winner, try
857            // compressing [meta_len:u32][raw_metadata][preprocessed_data] with zstd.
858            // IMPORTANT: do NOT embed metadata when dict compression is active —
859            // the dict format has its own layout (dict + chunks) and embedding
860            // metadata would break it.
861            if let Some(ref embedded_payload) = embedded_payload {
862                let embed_level = adaptive_fast_level(embedded_payload.len(), zstd_level_override);
863                if let Ok(zstd_embedded) = zstd::bulk::compress(embedded_payload, embed_level) {
864                    // Total: header(32) + 0 (no separate metadata) + zstd_embedded.
865                    let zstd_embedded_total = 32 + zstd_embedded.len();
866                    if zstd_embedded_total < best_total {
867                        best_compressed = zstd_embedded;
868                        best_total = zstd_embedded_total;
869                        best_dict = false;
870                        best_raw = false;
871                        best_brotli = false;
872                        best_embedded = true;
873                    }
874                }
875            }
876
877            let _ = best_total; // used only for comparisons
878            use_dict = best_dict;
879            use_raw_fallback = best_raw;
880            use_brotli = best_brotli;
881            use_meta_embedded = best_embedded;
882            best_compressed
883        }
884        // Balanced mode: dual-path CM + GRU byte predictor.
885        Mode::Balanced => {
886            let config = cm_config_for_mode(mode);
887            let cm_data = gru_compress(&preprocessed, config);
888            let mut payload = Vec::with_capacity(8 + cm_data.len());
889            payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
890            payload.extend_from_slice(&cm_data);
891            payload
892        }
893        // Max mode: try neural dual-path, fall back to CM-only.
894        Mode::Max => {
895            let config = cm_config_for_mode(mode);
896
897            #[cfg(feature = "neural")]
898            {
899                if let Some(mpath) = resolve_model_path(model_path) {
900                    match datacortex_neural::LlmPredictor::new(&mpath) {
901                        Ok(mut llm) => {
902                            let mut meta_mixer = datacortex_neural::MetaMixer::new(5);
903                            eprintln!(
904                                "[neural] Max mode: dual-path CM+LLM ({} bytes mapped)",
905                                llm.mapped_bytes()
906                            );
907                            let cm_data =
908                                neural_compress(&preprocessed, config, &mut llm, &mut meta_mixer);
909                            let mut payload = Vec::with_capacity(8 + cm_data.len());
910                            // Byte 0 of the 8-byte size prefix: set bit 7 to flag neural mode.
911                            // This lets the decompressor know to use neural path.
912                            let size_with_flag = preprocessed.len() as u64 | (1u64 << 63);
913                            payload.extend_from_slice(&size_with_flag.to_le_bytes());
914                            payload.extend_from_slice(&cm_data);
915                            payload
916                        }
917                        Err(e) => {
918                            eprintln!("[neural] LLM init failed, falling back to CM-only: {e}");
919                            let cm_data = cm_compress(&preprocessed, config);
920                            let mut payload = Vec::with_capacity(8 + cm_data.len());
921                            payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
922                            payload.extend_from_slice(&cm_data);
923                            payload
924                        }
925                    }
926                } else {
927                    eprintln!(
928                        "[neural] no model found, Max mode using CM-only. \
929                         Set DATACORTEX_MODEL or use --model-path."
930                    );
931                    let cm_data = cm_compress(&preprocessed, config);
932                    let mut payload = Vec::with_capacity(8 + cm_data.len());
933                    payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
934                    payload.extend_from_slice(&cm_data);
935                    payload
936                }
937            }
938
939            #[cfg(not(feature = "neural"))]
940            {
941                let _ = model_path; // suppress unused warning
942                let cm_data = cm_compress(&preprocessed, config);
943                let mut payload = Vec::with_capacity(8 + cm_data.len());
944                payload.extend_from_slice(&(preprocessed.len() as u64).to_le_bytes());
945                payload.extend_from_slice(&cm_data);
946                payload
947            }
948        }
949    };
950
951    // When raw fallback or embedded metadata won, use empty header metadata.
952    // - Raw fallback: decompressor handles empty chains (just decompresses, no reverse transforms).
953    // - Embedded: metadata lives inside the compressed stream, not in the header.
954    let final_metadata = if use_raw_fallback || use_meta_embedded {
955        vec![]
956    } else {
957        transform_metadata
958    };
959
960    // Compress metadata with zstd if it's large enough to benefit.
961    // Small metadata (<= 64 bytes) stays raw to avoid zstd frame overhead.
962    // Skipped when metadata is embedded (final_metadata is empty).
963    let (header_metadata, meta_compressed) = if final_metadata.len() > 64 {
964        let compressed_meta =
965            zstd::bulk::compress(&final_metadata, 19).unwrap_or_else(|_| final_metadata.clone());
966        if compressed_meta.len() < final_metadata.len() {
967            (compressed_meta, true)
968        } else {
969            (final_metadata, false)
970        }
971    } else {
972        (final_metadata, false)
973    };
974
975    let header = DcxHeader {
976        mode,
977        format_hint,
978        original_size: data.len() as u64,
979        compressed_size: compressed.len() as u64,
980        crc32: crc,
981        transform_metadata: header_metadata,
982        has_dict: use_dict,
983        meta_compressed,
984        use_brotli,
985        meta_embedded: use_meta_embedded,
986    };
987
988    header.write_to(output)?;
989    output.write_all(&compressed)?;
990
991    Ok(())
992}
993
/// Decompress a .dcx file from `input`, returning the original data.
///
/// Equivalent to [`decompress_with_model`] with no explicit model path.
/// For neural-flagged Max-mode files, model resolution then falls back to
/// the default lookup (per the error messages, `DATACORTEX_MODEL` /
/// `--model-path` — see `resolve_model_path`).
pub fn decompress<R: Read>(input: &mut R) -> io::Result<Vec<u8>> {
    decompress_with_model(input, None)
}
998
/// Decompress with optional explicit model path (for neural Max mode).
///
/// Runs the compression pipeline in reverse:
/// 1. Read the `.dcx` header and the exact-length compressed payload.
/// 2. Decode the payload with the engine selected by `header.mode`
///    (brotli/zstd/dict for Fast; GRU+CM for Balanced; CM or CM+LLM for Max).
/// 3. Recover transform metadata — either embedded at the front of the
///    decoded stream (`meta_embedded`) or carried, possibly zstd-compressed,
///    in the header.
/// 4. Reverse the format-preprocessing transform chain.
/// 5. Verify CRC-32 and original size against the header.
///
/// `model_path` is only consulted for Max-mode payloads whose size prefix
/// has the neural flag (bit 63) set; all other paths ignore it.
///
/// # Errors
/// Returns `InvalidData` for truncated or corrupt payloads, undecodable
/// metadata, CRC-32 mismatch, or size mismatch. Neural-flagged files also
/// fail when no model can be resolved/loaded, or when this build lacks the
/// `neural` feature.
pub fn decompress_with_model<R: Read>(
    input: &mut R,
    model_path: Option<&str>,
) -> io::Result<Vec<u8>> {
    let header = DcxHeader::read_from(input)?;

    // The header records the exact payload length, so read it in one shot.
    let mut compressed = vec![0u8; header.compressed_size as usize];
    input.read_exact(&mut compressed)?;

    // Step 1: Decompress with engine.
    let preprocessed = match header.mode {
        Mode::Fast => {
            if header.use_brotli {
                brotli_decompress(&compressed)?
            } else {
                // Upper bound for zstd::bulk's output buffer. The 2x + 64KiB
                // headroom is a heuristic — preprocessing (and embedded
                // metadata) can make the decoded stream larger than the
                // original payload.
                let capacity = header.original_size as usize * 2 + 65536;
                if header.has_dict {
                    decompress_with_dict(&compressed, capacity)?
                } else {
                    zstd::bulk::decompress(&compressed, capacity)
                        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?
                }
            }
        }
        Mode::Balanced => {
            // Balanced mode: dual-path CM + GRU byte predictor.
            if compressed.len() < 8 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "CM mode compressed data too short",
                ));
            }
            // First 8 bytes are the LE preprocessed size. Bit 63 is masked
            // defensively — the Balanced compressor writes the plain size
            // and never sets the neural flag.
            let size_raw = u64::from_le_bytes(compressed[..8].try_into().expect("8-byte slice"));
            let preprocessed_size = (size_raw & !(1u64 << 63)) as usize;
            let config = cm_config_for_mode(header.mode);
            gru_decompress(&compressed[8..], preprocessed_size, config)
        }
        Mode::Max => {
            // Max mode: may use neural (LLM) dual-path or CM-only.
            if compressed.len() < 8 {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "CM mode compressed data too short",
                ));
            }
            let size_raw = u64::from_le_bytes(compressed[..8].try_into().expect("8-byte slice"));

            // Check if bit 63 is set (neural flag).
            let neural_flag = size_raw & (1u64 << 63) != 0;
            let preprocessed_size = (size_raw & !(1u64 << 63)) as usize;
            let config = cm_config_for_mode(header.mode);

            if neural_flag {
                // Neural files are NOT decodable without the same LLM the
                // compressor used — fail loudly instead of producing garbage.
                #[cfg(feature = "neural")]
                {
                    if let Some(mpath) = resolve_model_path(model_path) {
                        match datacortex_neural::LlmPredictor::new(&mpath) {
                            Ok(mut llm) => {
                                let mut meta_mixer = datacortex_neural::MetaMixer::new(5);
                                eprintln!(
                                    "[neural] decompressing with dual-path CM+LLM ({} bytes mapped)",
                                    llm.mapped_bytes()
                                );
                                neural_decompress(
                                    &compressed[8..],
                                    preprocessed_size,
                                    config,
                                    &mut llm,
                                    &mut meta_mixer,
                                )
                            }
                            Err(e) => {
                                return Err(io::Error::new(
                                    io::ErrorKind::Other,
                                    format!(
                                        "file was compressed with neural mode but LLM failed to load: {e}"
                                    ),
                                ));
                            }
                        }
                    } else {
                        return Err(io::Error::new(
                            io::ErrorKind::Other,
                            "file was compressed with neural mode but no model found. \
                             Set DATACORTEX_MODEL or use --model-path.",
                        ));
                    }
                }

                #[cfg(not(feature = "neural"))]
                {
                    let _ = model_path;
                    return Err(io::Error::other(
                        "file was compressed with neural mode but this build lacks the \
                         `neural` feature. Rebuild with --features neural.",
                    ));
                }
            } else {
                cm_decompress(&compressed[8..], preprocessed_size, config)
            }
        }
    };

    // Step 1.5: Handle embedded metadata OR separate metadata.
    // When meta_embedded is set, the decompressed stream starts with:
    //   [meta_len: u32 LE][raw_metadata][preprocessed_data]
    // We extract the metadata and the actual preprocessed data from the stream.
    let (preprocessed, transform_metadata) = if header.meta_embedded {
        if preprocessed.len() < 4 {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "embedded metadata: decompressed stream too short for meta_len",
            ));
        }
        let meta_len =
            u32::from_le_bytes(preprocessed[0..4].try_into().expect("4-byte slice")) as usize;
        // Validate before slicing — meta_len comes from untrusted input.
        if preprocessed.len() < 4 + meta_len {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                format!(
                    "embedded metadata: stream too short for metadata ({} bytes needed, {} available)",
                    4 + meta_len,
                    preprocessed.len()
                ),
            ));
        }
        let metadata = preprocessed[4..4 + meta_len].to_vec();
        let actual_preprocessed = preprocessed[4 + meta_len..].to_vec();
        (actual_preprocessed, metadata)
    } else {
        // Decompress metadata if it was zstd-compressed (separate metadata path).
        // Use streaming decoder to avoid guessing decompressed size.
        let tm = if header.meta_compressed && !header.transform_metadata.is_empty() {
            let mut decoder =
                zstd::Decoder::new(Cursor::new(&header.transform_metadata)).map_err(|e| {
                    io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!("failed to init metadata decompressor: {e}"),
                    )
                })?;
            let mut decompressed_meta = Vec::new();
            decoder.read_to_end(&mut decompressed_meta).map_err(|e| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!("failed to decompress transform metadata: {e}"),
                )
            })?;
            decompressed_meta
        } else {
            header.transform_metadata.clone()
        };
        (preprocessed, tm)
    };

    // Step 2: Reverse preprocessing.
    // Empty metadata means no transforms need reversing (raw fallback, or
    // no transforms were applied at compression time).
    let data = if transform_metadata.is_empty() {
        preprocessed
    } else {
        let chain = TransformChain::deserialize(&transform_metadata)?;
        reverse_preprocess(&preprocessed, &chain)
    };

    // CRC-32 integrity check.
    let crc = crc32fast::hash(&data);
    if crc != header.crc32 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "CRC-32 mismatch: expected {:#010X}, got {:#010X}",
                header.crc32, crc
            ),
        ));
    }

    // Belt-and-braces: CRC collisions aside, the length must also match.
    if data.len() as u64 != header.original_size {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            format!(
                "size mismatch: header says {} bytes, got {}",
                header.original_size,
                data.len()
            ),
        ));
    }

    Ok(data)
}
1187
1188/// Compress to Vec (convenience).
1189pub fn compress_to_vec(
1190    data: &[u8],
1191    mode: Mode,
1192    format_override: Option<FormatHint>,
1193) -> io::Result<Vec<u8>> {
1194    let mut buf = Vec::new();
1195    compress(data, mode, format_override, &mut buf)?;
1196    Ok(buf)
1197}
1198
1199/// Compress to Vec with explicit model path.
1200pub fn compress_to_vec_with_model(
1201    data: &[u8],
1202    mode: Mode,
1203    format_override: Option<FormatHint>,
1204    model_path: Option<&str>,
1205) -> io::Result<Vec<u8>> {
1206    let mut buf = Vec::new();
1207    compress_with_model(data, mode, format_override, model_path, &mut buf)?;
1208    Ok(buf)
1209}
1210
1211/// Compress to Vec with explicit model path and zstd level override.
1212pub fn compress_to_vec_with_options(
1213    data: &[u8],
1214    mode: Mode,
1215    format_override: Option<FormatHint>,
1216    model_path: Option<&str>,
1217    zstd_level_override: Option<i32>,
1218) -> io::Result<Vec<u8>> {
1219    let mut buf = Vec::new();
1220    compress_with_options(
1221        data,
1222        mode,
1223        format_override,
1224        model_path,
1225        zstd_level_override,
1226        &mut buf,
1227    )?;
1228    Ok(buf)
1229}
1230
1231/// Decompress from slice (convenience).
1232pub fn decompress_from_slice(dcx_data: &[u8]) -> io::Result<Vec<u8>> {
1233    let mut cursor = Cursor::new(dcx_data);
1234    decompress(&mut cursor)
1235}
1236
/// Read header only (for `info` command).
///
/// Parses and returns just the `.dcx` header, leaving `input` positioned
/// after however many bytes `DcxHeader::read_from` consumed (i.e. at the
/// start of the compressed payload).
pub fn read_header<R: Read>(input: &mut R) -> io::Result<DcxHeader> {
    DcxHeader::read_from(input)
}
1241
1242/// Compress raw data with zstd at a given level (for benchmark comparison).
1243pub fn raw_zstd_compress(data: &[u8], level: i32) -> io::Result<Vec<u8>> {
1244    zstd::bulk::compress(data, level).map_err(io::Error::other)
1245}
1246
1247#[cfg(test)]
1248mod tests {
1249    use super::*;
1250
1251    #[test]
1252    fn fast_mode_roundtrip() {
1253        let original = b"Hello, DataCortex! This is a test of Fast mode compression.";
1254        let compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1255        let decompressed = decompress_from_slice(&compressed).unwrap();
1256        assert_eq!(decompressed, original);
1257    }
1258
1259    #[test]
1260    fn fast_mode_json_roundtrip() {
1261        let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
1262        let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1263        let decompressed = decompress_from_slice(&compressed).unwrap();
1264        assert_eq!(decompressed, data.to_vec());
1265    }
1266
1267    #[test]
1268    fn balanced_mode_roundtrip() {
1269        let original = b"Balanced mode test data with some content.";
1270        let compressed = compress_to_vec(original, Mode::Balanced, None).unwrap();
1271        let decompressed = decompress_from_slice(&compressed).unwrap();
1272        assert_eq!(decompressed, original);
1273    }
1274
1275    #[test]
1276    fn balanced_mode_longer_text() {
1277        let original = b"The quick brown fox jumps over the lazy dog. This sentence contains every letter of the English alphabet at least once. We need enough data to properly exercise the arithmetic coder and order-0 model.";
1278        let compressed = compress_to_vec(original, Mode::Balanced, None).unwrap();
1279        let decompressed = decompress_from_slice(&compressed).unwrap();
1280        assert_eq!(decompressed, original);
1281    }
1282
1283    #[test]
1284    fn balanced_mode_repetitive_data() {
1285        let data = "hello world! ".repeat(100);
1286        let compressed = compress_to_vec(data.as_bytes(), Mode::Balanced, None).unwrap();
1287        let decompressed = decompress_from_slice(&compressed).unwrap();
1288        assert_eq!(decompressed, data.as_bytes());
1289    }
1290
1291    #[test]
1292    fn balanced_mode_all_byte_values() {
1293        let original: Vec<u8> = (0..=255).collect();
1294        let compressed = compress_to_vec(&original, Mode::Balanced, None).unwrap();
1295        let decompressed = decompress_from_slice(&compressed).unwrap();
1296        assert_eq!(decompressed, original);
1297    }
1298
1299    #[test]
1300    fn balanced_mode_single_byte() {
1301        let original = b"X";
1302        let compressed = compress_to_vec(original, Mode::Balanced, None).unwrap();
1303        let decompressed = decompress_from_slice(&compressed).unwrap();
1304        assert_eq!(decompressed, original);
1305    }
1306
1307    #[test]
1308    fn balanced_mode_json_roundtrip() {
1309        let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
1310        let compressed = compress_to_vec(data, Mode::Balanced, Some(FormatHint::Json)).unwrap();
1311        let decompressed = decompress_from_slice(&compressed).unwrap();
1312        assert_eq!(decompressed, data.to_vec());
1313    }
1314
1315    #[test]
1316    fn empty_data_roundtrip() {
1317        let original = b"";
1318        for mode in [Mode::Fast, Mode::Balanced, Mode::Max] {
1319            let compressed = compress_to_vec(original, mode, None).unwrap();
1320            let decompressed = decompress_from_slice(&compressed).unwrap();
1321            assert_eq!(decompressed, original, "failed for mode {mode}");
1322        }
1323    }
1324
1325    #[test]
1326    fn crc_mismatch_detected() {
1327        let original = b"test data for CRC check";
1328        let mut compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1329        // Corrupt in the compressed data section (after header).
1330        let header_size = 32; // minimum header
1331        if compressed.len() > header_size + 5 {
1332            compressed[header_size + 3] ^= 0xFF;
1333        }
1334        assert!(decompress_from_slice(&compressed).is_err());
1335    }
1336
1337    #[test]
1338    fn fast_mode_actually_compresses() {
1339        // Repetitive data should compress well with zstd.
1340        let data = "hello world. ".repeat(100);
1341        let compressed = compress_to_vec(data.as_bytes(), Mode::Fast, None).unwrap();
1342        assert!(
1343            compressed.len() < data.len(),
1344            "Fast mode should compress repetitive data: {} vs {}",
1345            compressed.len(),
1346            data.len()
1347        );
1348    }
1349
1350    #[test]
1351    fn json_preprocessing_improves_fast_mode() {
1352        let data = br#"[{"name":"Alice","score":95},{"name":"Bob","score":87},{"name":"Carol","score":92},{"name":"Dave","score":88},{"name":"Eve","score":91}]"#;
1353        let with_preprocess = compress_to_vec(data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1354        let without_preprocess =
1355            compress_to_vec(data, Mode::Fast, Some(FormatHint::Generic)).unwrap();
1356
1357        // Both should decompress correctly.
1358        assert_eq!(
1359            decompress_from_slice(&with_preprocess).unwrap(),
1360            data.to_vec()
1361        );
1362        assert_eq!(
1363            decompress_from_slice(&without_preprocess).unwrap(),
1364            data.to_vec()
1365        );
1366    }
1367
1368    #[test]
1369    fn all_modes_roundtrip() {
1370        let data = b"test all modes with some more content to ensure decent compression";
1371        for mode in [Mode::Max, Mode::Balanced, Mode::Fast] {
1372            let compressed = compress_to_vec(data, mode, None).unwrap();
1373            let decompressed = decompress_from_slice(&compressed).unwrap();
1374            assert_eq!(decompressed, data, "failed for mode {mode}");
1375        }
1376    }
1377
1378    #[test]
1379    fn cm_compress_decompress_direct() {
1380        let data = b"Hello, World! This is a direct CM test.";
1381        let compressed = cm_compress(data, CMConfig::balanced());
1382        let decompressed = cm_decompress(&compressed, data.len(), CMConfig::balanced());
1383        assert_eq!(decompressed, data.to_vec());
1384    }
1385
1386    #[test]
1387    fn cm_empty() {
1388        let data: &[u8] = b"";
1389        let compressed = cm_compress(data, CMConfig::balanced());
1390        let decompressed = cm_decompress(&compressed, 0, CMConfig::balanced());
1391        assert!(decompressed.is_empty());
1392    }
1393
1394    #[test]
1395    fn cm_single_byte() {
1396        for byte in 0..=255u8 {
1397            let data = [byte];
1398            let compressed = cm_compress(&data, CMConfig::balanced());
1399            let decompressed = cm_decompress(&compressed, 1, CMConfig::balanced());
1400            assert_eq!(
1401                decompressed, data,
1402                "CM roundtrip failed for byte {byte:#04X}"
1403            );
1404        }
1405    }
1406
1407    #[test]
1408    fn cm_repetitive_compresses() {
1409        let data = vec![b'A'; 1000];
1410        let compressed = cm_compress(&data, CMConfig::balanced());
1411        // 1000 identical bytes should compress well with adaptive model.
1412        assert!(
1413            compressed.len() < 200,
1414            "CM should compress 1000 identical bytes well: {} bytes",
1415            compressed.len()
1416        );
1417        let decompressed = cm_decompress(&compressed, data.len(), CMConfig::balanced());
1418        assert_eq!(decompressed, data);
1419    }
1420
1421    #[test]
1422    fn max_mode_roundtrip() {
1423        let original = b"Max mode test data with some content for compression.";
1424        let compressed = compress_to_vec(original, Mode::Max, None).unwrap();
1425        let decompressed = decompress_from_slice(&compressed).unwrap();
1426        assert_eq!(decompressed, original);
1427    }
1428
1429    #[test]
1430    fn max_mode_longer_text() {
1431        let original = b"The quick brown fox jumps over the lazy dog. Max mode uses 2x context maps for better predictions with fewer hash collisions. This should compress slightly better than balanced mode.";
1432        let compressed = compress_to_vec(original, Mode::Max, None).unwrap();
1433        let decompressed = decompress_from_slice(&compressed).unwrap();
1434        assert_eq!(decompressed, original);
1435    }
1436
1437    // ─── Dictionary compression tests ──────────────────────────────────────────
1438
1439    #[test]
1440    fn test_dict_compress_roundtrip() {
1441        // Generate NDJSON data large enough to trigger dictionary training.
1442        // Repetitive columnar data is ideal for dictionary learning.
1443        let mut ndjson = String::new();
1444        for i in 0..500 {
1445            ndjson.push_str(&format!(
1446                r#"{{"id":{},"name":"user_{}","status":"active","score":{}}}"#,
1447                i,
1448                i,
1449                i * 17 % 100
1450            ));
1451            ndjson.push('\n');
1452        }
1453        let data = ndjson.as_bytes();
1454        assert!(
1455            data.len() > DICT_MIN_DATA_SIZE,
1456            "test data should exceed dict threshold: {} bytes",
1457            data.len()
1458        );
1459
1460        let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1461        let decompressed = decompress_from_slice(&compressed).unwrap();
1462        assert_eq!(
1463            decompressed, data,
1464            "dict compress roundtrip: byte-exact mismatch"
1465        );
1466    }
1467
1468    #[test]
1469    fn test_dict_falls_back_on_small() {
1470        // Data smaller than DICT_MIN_DATA_SIZE should not use dictionary.
1471        let data = b"small data that won't trigger dictionary training";
1472        assert!(data.len() < DICT_MIN_DATA_SIZE);
1473
1474        let compressed = compress_to_vec(data, Mode::Fast, None).unwrap();
1475        let decompressed = decompress_from_slice(&compressed).unwrap();
1476        assert_eq!(decompressed, data.to_vec());
1477
1478        // Verify no dict flag in header.
1479        let mut cursor = Cursor::new(&compressed);
1480        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1481        assert!(!header.has_dict, "small data should not have dict flag set");
1482    }
1483
1484    #[test]
1485    fn test_dict_backward_compat() {
1486        // Compress with old behavior (no dict) and verify it still decompresses.
1487        // We simulate this by compressing small data (which skips dict).
1488        let original = b"backward compatibility test data for decompression";
1489        let compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1490
1491        // Verify the flag is NOT set.
1492        let mut cursor = Cursor::new(&compressed);
1493        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1494        assert!(!header.has_dict);
1495
1496        // Decompress should work fine.
1497        let decompressed = decompress_from_slice(&compressed).unwrap();
1498        assert_eq!(decompressed, original.to_vec());
1499    }
1500
1501    #[test]
1502    fn test_dict_ndjson_large_roundtrip() {
1503        // Larger NDJSON dataset — should benefit from dictionary.
1504        let mut ndjson = String::new();
1505        for i in 0..2000 {
1506            ndjson.push_str(&format!(
1507                r#"{{"timestamp":"2025-01-{:02}T{:02}:{:02}:00Z","level":"info","message":"Request processed","request_id":"req_{}","duration_ms":{}}}"#,
1508                (i % 28) + 1,
1509                i % 24,
1510                i % 60,
1511                i,
1512                (i * 13) % 500
1513            ));
1514            ndjson.push('\n');
1515        }
1516        let data = ndjson.as_bytes();
1517
1518        let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1519        let decompressed = decompress_from_slice(&compressed).unwrap();
1520        assert_eq!(decompressed, data, "large NDJSON roundtrip mismatch");
1521    }
1522
1523    #[test]
1524    fn test_dict_generic_data_roundtrip() {
1525        // Generic (non-JSON) data that's large enough for dict training.
1526        // Uses fixed-size block splitting instead of column boundaries.
1527        let mut data = Vec::new();
1528        for i in 0..3000 {
1529            data.extend_from_slice(
1530                format!("line {i}: the quick brown fox jumps over the lazy dog\n").as_bytes(),
1531            );
1532        }
1533        assert!(data.len() > DICT_MIN_DATA_SIZE);
1534
1535        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Generic)).unwrap();
1536        let decompressed = decompress_from_slice(&compressed).unwrap();
1537        assert_eq!(decompressed, data, "generic data dict roundtrip mismatch");
1538    }
1539
1540    #[test]
1541    fn test_dict_does_not_affect_other_modes() {
1542        // Dictionary training should only apply to Fast mode.
1543        // Balanced and Max modes should remain unchanged.
1544        let mut ndjson = String::new();
1545        for i in 0..200 {
1546            ndjson.push_str(&format!(
1547                r#"{{"id":{},"name":"user_{}","status":"active"}}"#,
1548                i, i
1549            ));
1550            ndjson.push('\n');
1551        }
1552        let data = ndjson.as_bytes();
1553
1554        for mode in [Mode::Balanced, Mode::Max] {
1555            let compressed = compress_to_vec(data, mode, Some(FormatHint::Ndjson)).unwrap();
1556            let mut cursor = Cursor::new(&compressed);
1557            let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1558            assert!(!header.has_dict, "mode {mode} should never have dict flag");
1559            let decompressed = decompress_from_slice(&compressed).unwrap();
1560            assert_eq!(decompressed, data, "roundtrip failed for mode {mode}");
1561        }
1562    }
1563
1564    // ─── Configurable zstd level tests ──────────────────────────────────────
1565
1566    #[test]
1567    fn test_compress_with_level() {
1568        // Compress with level 19 override in Fast mode, verify roundtrip.
1569        let data = "hello world, compressing with custom zstd level. ".repeat(50);
1570        let compressed =
1571            compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, Some(19))
1572                .unwrap();
1573        let decompressed = decompress_from_slice(&compressed).unwrap();
1574        assert_eq!(decompressed, data.as_bytes(), "level 19 roundtrip failed");
1575    }
1576
1577    #[test]
1578    fn test_compress_with_level_default() {
1579        // No level override — should use mode default (9 for Fast).
1580        let data = "default level test data. ".repeat(50);
1581        let compressed =
1582            compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, None).unwrap();
1583        let decompressed = decompress_from_slice(&compressed).unwrap();
1584        assert_eq!(
1585            decompressed,
1586            data.as_bytes(),
1587            "default level roundtrip failed"
1588        );
1589    }
1590
1591    #[test]
1592    fn test_compress_with_level_higher_ratio() {
1593        // Level 19 should compress better than level 1 on repetitive data.
1594        let data = r#"{"name":"Alice","score":95}"#.repeat(200);
1595        let low =
1596            compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, Some(1)).unwrap();
1597        let high = compress_to_vec_with_options(data.as_bytes(), Mode::Fast, None, None, Some(19))
1598            .unwrap();
1599
1600        // Both must roundtrip.
1601        assert_eq!(decompress_from_slice(&low).unwrap(), data.as_bytes());
1602        assert_eq!(decompress_from_slice(&high).unwrap(), data.as_bytes());
1603
1604        // Higher level should produce smaller output (or at least not larger).
1605        assert!(
1606            high.len() <= low.len(),
1607            "level 19 ({}) should be <= level 1 ({})",
1608            high.len(),
1609            low.len()
1610        );
1611    }
1612
1613    // ─── Auto-fallback tests ──────────────────────────────────────────────────
1614
1615    #[test]
1616    fn test_auto_fallback_picks_smaller() {
1617        // citm_catalog.json has extreme repetition. The auto-fallback picks
1618        // whichever path (raw or preprocessed) produces the smallest output.
1619        // With compressed metadata, the preprocessed path may now win.
1620        let data = std::fs::read(concat!(
1621            env!("CARGO_MANIFEST_DIR"),
1622            "/../../corpus/json-bench/citm_catalog.json"
1623        ))
1624        .unwrap();
1625
1626        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1627        let decompressed = decompress_from_slice(&compressed).unwrap();
1628        assert_eq!(decompressed, data, "citm_catalog roundtrip failed");
1629
1630        // Verify good compression ratio regardless of which path won.
1631        let ratio = data.len() as f64 / compressed.len() as f64;
1632        assert!(
1633            ratio > 50.0,
1634            "citm_catalog should achieve >50x, got {ratio:.1}x"
1635        );
1636    }
1637
1638    #[test]
1639    fn test_auto_fallback_preprocessed_wins_on_ndjson() {
1640        // NDJSON with uniform schema should still prefer preprocessed path
1641        // (columnar + typed encoding beats raw zstd for structured data).
1642        let data = std::fs::read(concat!(
1643            env!("CARGO_MANIFEST_DIR"),
1644            "/../../corpus/test-ndjson.ndjson"
1645        ))
1646        .unwrap();
1647
1648        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1649        let decompressed = decompress_from_slice(&compressed).unwrap();
1650        assert_eq!(decompressed, data, "test-ndjson roundtrip failed");
1651
1652        // Check that preprocessing was used: either non-empty transform metadata
1653        // in the header, or metadata embedded in the compressed stream.
1654        let mut cursor = Cursor::new(&compressed);
1655        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1656        assert!(
1657            !header.transform_metadata.is_empty() || header.meta_embedded,
1658            "test-ndjson should prefer preprocessed path (non-empty transform metadata or embedded)"
1659        );
1660    }
1661
1662    #[test]
1663    fn test_auto_fallback_roundtrip() {
1664        // Verify both raw and preprocessed paths produce correct roundtrips.
1665        // Use citm_catalog (raw wins) and test-ndjson (preprocessed wins).
1666        let citm = std::fs::read(concat!(
1667            env!("CARGO_MANIFEST_DIR"),
1668            "/../../corpus/json-bench/citm_catalog.json"
1669        ))
1670        .unwrap();
1671        let ndjson = std::fs::read(concat!(
1672            env!("CARGO_MANIFEST_DIR"),
1673            "/../../corpus/test-ndjson.ndjson"
1674        ))
1675        .unwrap();
1676
1677        // citm_catalog — raw path
1678        let compressed_citm = compress_to_vec(&citm, Mode::Fast, Some(FormatHint::Json)).unwrap();
1679        let decompressed_citm = decompress_from_slice(&compressed_citm).unwrap();
1680        assert_eq!(
1681            decompressed_citm, citm,
1682            "citm_catalog roundtrip (raw path) failed"
1683        );
1684
1685        // test-ndjson — preprocessed path
1686        let compressed_ndjson =
1687            compress_to_vec(&ndjson, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1688        let decompressed_ndjson = decompress_from_slice(&compressed_ndjson).unwrap();
1689        assert_eq!(
1690            decompressed_ndjson, ndjson,
1691            "test-ndjson roundtrip (preprocessed path) failed"
1692        );
1693    }
1694
1695    // ─── Adaptive level tests ─────────────────────────────────────────────────
1696
1697    #[test]
1698    fn test_adaptive_level_small_data() {
1699        // <1MB should use level 19 (zstd-19 is <50ms on small data).
1700        assert_eq!(adaptive_fast_level(100_000, None), 19);
1701        assert_eq!(adaptive_fast_level(500_000, None), 19);
1702        assert_eq!(adaptive_fast_level(1_048_576, None), 19);
1703        assert_eq!(adaptive_fast_level(0, None), 19);
1704    }
1705
1706    #[test]
1707    fn test_adaptive_level_large_data() {
1708        // 1MB-10MB should use level 13, >10MB should use level 9.
1709        assert_eq!(adaptive_fast_level(1_048_577, None), 13);
1710        assert_eq!(adaptive_fast_level(5_000_000, None), 13);
1711        assert_eq!(adaptive_fast_level(10_485_760, None), 13);
1712        assert_eq!(adaptive_fast_level(10_485_761, None), 9);
1713        assert_eq!(adaptive_fast_level(100_000_000, None), 9);
1714    }
1715
1716    #[test]
1717    fn test_adaptive_level_override() {
1718        // --level flag should always override adaptive level.
1719        assert_eq!(adaptive_fast_level(100, Some(3)), 3);
1720        assert_eq!(adaptive_fast_level(100_000_000, Some(22)), 22);
1721        assert_eq!(adaptive_fast_level(0, Some(1)), 1);
1722    }
1723
1724    // ─── Compressed metadata tests ──────────────────────────────────────────────
1725
1726    #[test]
1727    fn test_compressed_metadata_roundtrip() {
1728        // Generate NDJSON data large enough to produce > 64 bytes of transform metadata.
1729        let mut ndjson = String::new();
1730        for i in 0..500 {
1731            ndjson.push_str(&format!(
1732                r#"{{"id":{},"name":"user_{}","status":"active","score":{}}}"#,
1733                i,
1734                i,
1735                i * 17 % 100
1736            ));
1737            ndjson.push('\n');
1738        }
1739        let data = ndjson.as_bytes();
1740
1741        let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1742        let decompressed = decompress_from_slice(&compressed).unwrap();
1743        assert_eq!(
1744            decompressed, data,
1745            "compressed metadata roundtrip: byte-exact mismatch"
1746        );
1747
1748        // Verify the header has meta_compressed set if metadata was large enough.
1749        let mut cursor = Cursor::new(&compressed);
1750        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1751        // The file should have used preprocessed path (non-empty metadata).
1752        if !header.transform_metadata.is_empty() && header.transform_metadata.len() > 10 {
1753            // Metadata was present — check that compressed flag makes sense.
1754            // (meta_compressed is true only if compression actually saved space)
1755            // Just verify roundtrip was correct — the flag is an optimization detail.
1756        }
1757    }
1758
1759    #[test]
1760    fn test_compressed_metadata_backward_compat() {
1761        // Simulate old files that have no compressed metadata (bit 2 = 0).
1762        // These should still decompress correctly.
1763        let original = b"backward compatibility test data for metadata decompression";
1764        let compressed = compress_to_vec(original, Mode::Fast, None).unwrap();
1765
1766        // Verify decompression works.
1767        let decompressed = decompress_from_slice(&compressed).unwrap();
1768        assert_eq!(decompressed, original.to_vec());
1769
1770        // For small data, metadata should be empty or very small — no compression.
1771        let mut cursor = Cursor::new(&compressed);
1772        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1773        // Small data may or may not have metadata, but it should roundtrip either way.
1774        assert!(!header.meta_compressed || !header.transform_metadata.is_empty());
1775    }
1776
1777    #[test]
1778    fn test_compressed_metadata_small_skipped() {
1779        // Small metadata (< 64 bytes) should NOT be compressed — zstd frame overhead
1780        // would make it larger.
1781        let data = br#"{"name":"Alice","age":30}"#;
1782        let compressed = compress_to_vec(data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1783        let decompressed = decompress_from_slice(&compressed).unwrap();
1784        assert_eq!(decompressed, data.to_vec());
1785
1786        let mut cursor = Cursor::new(&compressed);
1787        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1788        // Small JSON has small metadata — should not be compressed.
1789        if header.transform_metadata.len() <= 64 {
1790            assert!(
1791                !header.meta_compressed,
1792                "metadata <= 64 bytes should not be compressed, but meta_compressed=true \
1793                 for {} bytes of metadata",
1794                header.transform_metadata.len()
1795            );
1796        }
1797    }
1798
1799    #[test]
1800    fn test_twitter_json_brotli_wins() {
1801        // twitter.json should use brotli — raw brotli-11 beats both preprocessed+zstd
1802        // and raw+zstd on this file.
1803        let data = std::fs::read(concat!(
1804            env!("CARGO_MANIFEST_DIR"),
1805            "/../../corpus/json-bench/twitter.json"
1806        ))
1807        .unwrap();
1808
1809        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1810        let decompressed = decompress_from_slice(&compressed).unwrap();
1811        assert_eq!(decompressed, data, "twitter.json roundtrip failed");
1812
1813        // Check that brotli was selected.
1814        let mut cursor = Cursor::new(&compressed);
1815        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1816        assert!(
1817            header.use_brotli,
1818            "twitter.json should use brotli (FLAG_BROTLI set in header)"
1819        );
1820    }
1821
1822    #[test]
1823    fn test_compressed_metadata_all_modes_roundtrip() {
1824        // Metadata compression applies to all modes, not just Fast.
1825        let mut ndjson = String::new();
1826        for i in 0..200 {
1827            ndjson.push_str(&format!(
1828                r#"{{"id":{},"name":"user_{}","status":"active"}}"#,
1829                i, i
1830            ));
1831            ndjson.push('\n');
1832        }
1833        let data = ndjson.as_bytes();
1834
1835        for mode in [Mode::Fast, Mode::Balanced, Mode::Max] {
1836            let compressed = compress_to_vec(data, mode, Some(FormatHint::Ndjson)).unwrap();
1837            let decompressed = decompress_from_slice(&compressed).unwrap();
1838            assert_eq!(
1839                decompressed, data,
1840                "compressed metadata roundtrip failed for mode {mode}"
1841            );
1842        }
1843    }
1844
1845    // ─── Brotli auto-fallback tests ──────────────────────────────────────────
1846
1847    #[test]
1848    fn test_brotli_compress_roundtrip() {
1849        // Direct brotli compress/decompress helper roundtrip.
1850        let data = b"Hello, brotli! This is a test of the brotli compression helpers.";
1851        let compressed = brotli_compress(data, 11, BROTLI_MODE_GENERIC).unwrap();
1852        let decompressed = brotli_decompress(&compressed).unwrap();
1853        assert_eq!(decompressed, data.to_vec());
1854    }
1855
1856    #[test]
1857    fn test_brotli_auto_fallback_twitter() {
1858        // twitter.json should select brotli and roundtrip correctly.
1859        let data = std::fs::read(concat!(
1860            env!("CARGO_MANIFEST_DIR"),
1861            "/../../corpus/json-bench/twitter.json"
1862        ))
1863        .unwrap();
1864
1865        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1866        let decompressed = decompress_from_slice(&compressed).unwrap();
1867        assert_eq!(decompressed, data, "twitter.json brotli roundtrip failed");
1868
1869        let mut cursor = Cursor::new(&compressed);
1870        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
1871        assert!(
1872            header.use_brotli,
1873            "twitter.json should use brotli in auto-fallback"
1874        );
1875    }
1876
1877    #[test]
1878    fn test_brotli_ndjson_roundtrip() {
1879        // NDJSON with uniform schema — regardless of which entropy coder wins,
1880        // the roundtrip must be byte-exact.
1881        let data = std::fs::read(concat!(
1882            env!("CARGO_MANIFEST_DIR"),
1883            "/../../corpus/test-ndjson.ndjson"
1884        ))
1885        .unwrap();
1886
1887        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
1888        let decompressed = decompress_from_slice(&compressed).unwrap();
1889        assert_eq!(decompressed, data, "ndjson roundtrip failed");
1890    }
1891
1892    #[test]
1893    fn test_brotli_backward_compat() {
1894        // Old .dcx files without the brotli flag (bit 3 = 0) must still decompress.
1895        // We simulate an old file by manually crafting a .dcx with FLAG_BROTLI unset.
1896        // Compress with zstd directly and build a minimal .dcx header.
1897        let original = b"backward compatibility test: this data was compressed without brotli";
1898        let crc = crc32fast::hash(original);
1899        let zstd_compressed = zstd::bulk::compress(original, 19).unwrap();
1900
1901        let header = crate::dcx::DcxHeader {
1902            mode: Mode::Fast,
1903            format_hint: crate::dcx::FormatHint::Generic,
1904            original_size: original.len() as u64,
1905            compressed_size: zstd_compressed.len() as u64,
1906            crc32: crc,
1907            transform_metadata: vec![],
1908            has_dict: false,
1909            meta_compressed: false,
1910            use_brotli: false,
1911            meta_embedded: false,
1912        };
1913
1914        let mut buf = Vec::new();
1915        header.write_to(&mut buf).unwrap();
1916        buf.extend_from_slice(&zstd_compressed);
1917
1918        // Verify the brotli flag is NOT set in the serialized header.
1919        assert_eq!(buf[7] & crate::dcx::FLAG_BROTLI, 0);
1920
1921        // Decompress — must work even though brotli path exists.
1922        let decompressed = decompress_from_slice(&buf).unwrap();
1923        assert_eq!(decompressed, original.to_vec());
1924    }
1925
1926    // ─── Embedded metadata tests ──────────────────────────────────────────────
1927
1928    #[test]
1929    fn test_embedded_metadata_roundtrip() {
1930        // Compress test-api.json with Fast mode — if embedded metadata is used,
1931        // the roundtrip must be byte-exact.
1932        let data = std::fs::read(concat!(
1933            env!("CARGO_MANIFEST_DIR"),
1934            "/../../corpus/test-api.json"
1935        ))
1936        .unwrap();
1937
1938        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1939        let decompressed = decompress_from_slice(&compressed).unwrap();
1940        assert_eq!(
1941            decompressed, data,
1942            "test-api.json embedded metadata roundtrip: byte-exact mismatch"
1943        );
1944    }
1945
1946    #[test]
1947    fn test_embedded_metadata_backward_compat() {
1948        // Old .dcx files without the meta_embedded flag (bit 4 = 0) must still decompress.
1949        // We simulate an old file by manually crafting a .dcx with FLAG_META_EMBEDDED unset
1950        // and separate transform metadata.
1951        let original = b"backward compat: no embedded metadata in this old file format";
1952        let crc = crc32fast::hash(original);
1953        let zstd_compressed = zstd::bulk::compress(original, 19).unwrap();
1954
1955        let header = crate::dcx::DcxHeader {
1956            mode: Mode::Fast,
1957            format_hint: crate::dcx::FormatHint::Generic,
1958            original_size: original.len() as u64,
1959            compressed_size: zstd_compressed.len() as u64,
1960            crc32: crc,
1961            transform_metadata: vec![],
1962            has_dict: false,
1963            meta_compressed: false,
1964            use_brotli: false,
1965            meta_embedded: false,
1966        };
1967
1968        let mut buf = Vec::new();
1969        header.write_to(&mut buf).unwrap();
1970        buf.extend_from_slice(&zstd_compressed);
1971
1972        // Verify meta_embedded flag is NOT set.
1973        assert_eq!(buf[7] & crate::dcx::FLAG_META_EMBEDDED, 0);
1974
1975        // Decompress — must work without embedded metadata support.
1976        let decompressed = decompress_from_slice(&buf).unwrap();
1977        assert_eq!(decompressed, original.to_vec());
1978    }
1979
1980    #[test]
1981    fn test_embedded_metadata_small_file_improvement() {
1982        // test-api.json is a small file (37KB) where embedded metadata should
1983        // save overhead compared to separate metadata.
1984        let data = std::fs::read(concat!(
1985            env!("CARGO_MANIFEST_DIR"),
1986            "/../../corpus/test-api.json"
1987        ))
1988        .unwrap();
1989
1990        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Json)).unwrap();
1991        let decompressed = decompress_from_slice(&compressed).unwrap();
1992        assert_eq!(decompressed, data, "roundtrip failed");
1993
1994        // Verify the file compresses to a reasonable size.
1995        let ratio = data.len() as f64 / compressed.len() as f64;
1996        assert!(
1997            ratio > 5.0,
1998            "test-api.json should achieve >5x compression, got {ratio:.1}x"
1999        );
2000
2001        // Check header to see which path was chosen.
2002        let mut cursor = Cursor::new(&compressed);
2003        let header = crate::dcx::DcxHeader::read_from(&mut cursor).unwrap();
2004
2005        // If embedded was chosen, verify the flag is set and header metadata is empty.
2006        if header.meta_embedded {
2007            assert!(
2008                header.transform_metadata.is_empty(),
2009                "meta_embedded header should have empty transform_metadata"
2010            );
2011            assert!(header.use_brotli, "meta_embedded should use brotli codec");
2012        }
2013    }
2014
2015    #[test]
2016    fn test_embedded_metadata_ndjson_roundtrip() {
2017        // NDJSON files with transforms must still roundtrip correctly
2018        // regardless of whether embedded or separate metadata is chosen.
2019        let data = std::fs::read(concat!(
2020            env!("CARGO_MANIFEST_DIR"),
2021            "/../../corpus/test-ndjson.ndjson"
2022        ))
2023        .unwrap();
2024
2025        let compressed = compress_to_vec(&data, Mode::Fast, Some(FormatHint::Ndjson)).unwrap();
2026        let decompressed = decompress_from_slice(&compressed).unwrap();
2027        assert_eq!(
2028            decompressed, data,
2029            "NDJSON embedded metadata roundtrip: byte-exact mismatch"
2030        );
2031    }
2032
2033    #[test]
2034    fn test_embedded_metadata_manual_roundtrip() {
2035        // Manually construct an embedded-metadata .dcx to verify the decompress path
2036        // handles the format correctly, independent of what the compressor chooses.
2037        let original = b"Hello, embedded metadata world! This is a test.";
2038        let crc = crc32fast::hash(original);
2039
2040        // Build embedded payload with an empty transform chain so reverse_preprocess
2041        // is a no-op and the data passes through unchanged.
2042        let empty_chain = TransformChain::new();
2043        let raw_metadata = empty_chain.serialize();
2044
2045        // Build embedded payload: [meta_len:u32 LE][raw_metadata][original_data]
2046        let mut embedded = Vec::new();
2047        embedded.extend_from_slice(&(raw_metadata.len() as u32).to_le_bytes());
2048        embedded.extend_from_slice(&raw_metadata);
2049        embedded.extend_from_slice(original);
2050
2051        let brotli_data = brotli_compress(&embedded, 11, BROTLI_MODE_GENERIC).unwrap();
2052
2053        let header = crate::dcx::DcxHeader {
2054            mode: Mode::Fast,
2055            format_hint: crate::dcx::FormatHint::Generic,
2056            original_size: original.len() as u64,
2057            compressed_size: brotli_data.len() as u64,
2058            crc32: crc,
2059            transform_metadata: vec![], // empty — metadata is embedded
2060            has_dict: false,
2061            meta_compressed: false,
2062            use_brotli: true,
2063            meta_embedded: true,
2064        };
2065
2066        let mut buf = Vec::new();
2067        header.write_to(&mut buf).unwrap();
2068        buf.extend_from_slice(&brotli_data);
2069
2070        // Verify flags.
2071        assert_ne!(buf[7] & crate::dcx::FLAG_META_EMBEDDED, 0);
2072        assert_ne!(buf[7] & crate::dcx::FLAG_BROTLI, 0);
2073
2074        // Decompress and verify.
2075        let decompressed = decompress_from_slice(&buf).unwrap();
2076        assert_eq!(decompressed, original.to_vec());
2077    }
2078
2079    // ─── Optimization: Brotli TEXT mode tests ───────────────────────────────
2080
2081    #[test]
2082    fn test_brotli_text_mode_on_raw() {
2083        // Verify TEXT mode produces valid brotli that decompresses correctly.
2084        let data = br#"{"name":"Alice","age":30,"city":"New York","active":true}"#;
2085
2086        // TEXT mode (for raw UTF-8/JSON).
2087        let compressed_text = brotli_compress(data, 11, BROTLI_MODE_TEXT).unwrap();
2088        let decompressed_text = brotli_decompress(&compressed_text).unwrap();
2089        assert_eq!(
2090            decompressed_text,
2091            data.to_vec(),
2092            "TEXT mode roundtrip failed"
2093        );
2094
2095        // GENERIC mode (for comparison).
2096        let compressed_generic = brotli_compress(data, 11, BROTLI_MODE_GENERIC).unwrap();
2097        let decompressed_generic = brotli_decompress(&compressed_generic).unwrap();
2098        assert_eq!(
2099            decompressed_generic,
2100            data.to_vec(),
2101            "GENERIC mode roundtrip failed"
2102        );
2103
2104        // Both must produce valid output — TEXT mode should not be larger than
2105        // GENERIC on UTF-8 text (or at most within a few bytes).
2106        // We don't assert TEXT < GENERIC because on tiny data the difference is negligible,
2107        // but we verify the feature works.
2108        assert!(
2109            !compressed_text.is_empty(),
2110            "TEXT mode should produce non-empty output"
2111        );
2112    }
2113
2114    // ─── Optimization: Zstd embedded metadata tests ─────────────────────────
2115
2116    #[test]
2117    fn test_zstd_embedded_metadata_roundtrip() {
2118        // Manually construct a .dcx with zstd-compressed embedded metadata
2119        // (meta_embedded=true, use_brotli=false) and verify roundtrip.
2120        let original = b"Hello, zstd embedded metadata! This is a test of the zstd path.";
2121        let crc = crc32fast::hash(original);
2122
2123        // Build embedded payload with an empty transform chain.
2124        let empty_chain = TransformChain::new();
2125        let raw_metadata = empty_chain.serialize();
2126
2127        // [meta_len:u32 LE][raw_metadata][original_data]
2128        let mut embedded = Vec::new();
2129        embedded.extend_from_slice(&(raw_metadata.len() as u32).to_le_bytes());
2130        embedded.extend_from_slice(&raw_metadata);
2131        embedded.extend_from_slice(original);
2132
2133        let zstd_data = zstd::bulk::compress(&embedded, 19).unwrap();
2134
2135        let header = crate::dcx::DcxHeader {
2136            mode: Mode::Fast,
2137            format_hint: crate::dcx::FormatHint::Generic,
2138            original_size: original.len() as u64,
2139            compressed_size: zstd_data.len() as u64,
2140            crc32: crc,
2141            transform_metadata: vec![], // empty — metadata is embedded
2142            has_dict: false,
2143            meta_compressed: false,
2144            use_brotli: false, // zstd, not brotli
2145            meta_embedded: true,
2146        };
2147
2148        let mut buf = Vec::new();
2149        header.write_to(&mut buf).unwrap();
2150        buf.extend_from_slice(&zstd_data);
2151
2152        // Verify flags: meta_embedded set, brotli NOT set.
2153        assert_ne!(buf[7] & crate::dcx::FLAG_META_EMBEDDED, 0);
2154        assert_eq!(buf[7] & crate::dcx::FLAG_BROTLI, 0);
2155
2156        // Decompress and verify byte-exact roundtrip.
2157        let decompressed = decompress_from_slice(&buf).unwrap();
2158        assert_eq!(decompressed, original.to_vec());
2159    }
2160
2161    // ─── Optimization: Multi-quality brotli tests ───────────────────────────
2162
2163    #[test]
2164    fn test_multi_quality_brotli() {
2165        // Verify both quality 10 and 11 produce valid brotli that decompresses.
2166        // On some data q10 beats q11 — we just verify both work correctly.
2167        let data = br#"{"items":[1,2,3,4,5],"nested":{"a":"hello","b":"world"}}"#;
2168
2169        let q10 = brotli_compress(data, 10, BROTLI_MODE_GENERIC).unwrap();
2170        let q11 = brotli_compress(data, 11, BROTLI_MODE_GENERIC).unwrap();
2171
2172        let dec_q10 = brotli_decompress(&q10).unwrap();
2173        let dec_q11 = brotli_decompress(&q11).unwrap();
2174
2175        assert_eq!(dec_q10, data.to_vec(), "quality 10 roundtrip failed");
2176        assert_eq!(dec_q11, data.to_vec(), "quality 11 roundtrip failed");
2177
2178        // Both should produce non-empty compressed output.
2179        assert!(!q10.is_empty());
2180        assert!(!q11.is_empty());
2181
2182        // The auto-fallback should pick the smaller one.
2183        // We can't assert which is smaller (data-dependent), but verify the logic
2184        // by checking that auto-fallback roundtrips on real corpus files.
2185        let corpus_files = [
2186            concat!(env!("CARGO_MANIFEST_DIR"), "/../../corpus/test-api.json"),
2187            concat!(
2188                env!("CARGO_MANIFEST_DIR"),
2189                "/../../corpus/json-bench/twitter.json"
2190            ),
2191        ];
2192        for path in corpus_files {
2193            let file_data = std::fs::read(path).unwrap();
2194            let compressed =
2195                compress_to_vec(&file_data, Mode::Fast, Some(FormatHint::Json)).unwrap();
2196            let decompressed = decompress_from_slice(&compressed).unwrap();
2197            assert_eq!(
2198                decompressed, file_data,
2199                "multi-quality roundtrip failed for {path}"
2200            );
2201        }
2202    }
2203}