Skip to main content

compress_to_vec

Function compress_to_vec 

Source
pub fn compress_to_vec(
    data: &[u8],
    mode: Mode,
    format_override: Option<FormatHint>,
) -> Result<Vec<u8>>
Expand description

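Compresses `data` using the given `mode` and, on success, returns the compressed bytes as a new `Vec<u8>` (convenience function). `format_override` optionally supplies a `FormatHint`; for instance, the repository example below passes `Some(FormatHint::Generic)` so that input which has already been transformed is not transformed again.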

Examples found in repository?
examples/validation_ab_test.rs (line 32)
17fn main() {
18    // Read test file.
19    let data =
20        fs::read("corpus/test-ndjson.ndjson").expect("failed to read corpus/test-ndjson.ndjson");
21    println!("Original NDJSON: {} bytes", data.len());
22
23    // Apply NDJSON columnar transform (low-level, no value dict).
24    let result = datacortex_core::format::ndjson::preprocess(&data)
25        .expect("ndjson preprocess failed — is the file valid uniform NDJSON?");
26    let columnar = result.data;
27    println!("Columnar data:   {} bytes", columnar.len());
28
29    // --- Variant A: compress original columnar data with CM ---
30    // Use Generic format hint so compress_to_vec does NOT re-apply transforms.
31    eprintln!("[A] Compressing columnar data (this takes a while)...");
32    let compressed_a = compress_to_vec(&columnar, Mode::Balanced, Some(FormatHint::Generic))
33        .expect("compress variant A");
34    let bpb_a = compressed_a.len() as f64 * 8.0 / data.len() as f64;
35
36    // --- Variant B: strip quotes then compress ---
37    let stripped = strip_quotes(&columnar);
38    let quotes_removed = columnar.len() - stripped.len();
39    println!(
40        "Quote-stripped:   {} bytes ({:.1}% of columnar, {} quote bytes removed)",
41        stripped.len(),
42        stripped.len() as f64 / columnar.len() as f64 * 100.0,
43        quotes_removed,
44    );
45
46    eprintln!("[B] Compressing quote-stripped data (this takes a while)...");
47    let compressed_b = compress_to_vec(&stripped, Mode::Balanced, Some(FormatHint::Generic))
48        .expect("compress variant B");
49    let bpb_b = compressed_b.len() as f64 * 8.0 / data.len() as f64;
50
51    // --- Results ---
52    println!();
53    println!("=== RESULTS ===");
54    println!(
55        "Variant A (columnar, no strip): {} bytes, {:.3} bpb",
56        compressed_a.len(),
57        bpb_a
58    );
59    println!(
60        "Variant B (quote-stripped):      {} bytes, {:.3} bpb",
61        compressed_b.len(),
62        bpb_b
63    );
64
65    let improvement =
66        (compressed_a.len() as f64 - compressed_b.len() as f64) / compressed_a.len() as f64 * 100.0;
67    println!("Improvement: {:.1}%", improvement);
68
69    if compressed_b.len() < compressed_a.len() {
70        println!();
71        println!(
72            "VERDICT: PROCEED — quote stripping HELPS CM. Typed encoding will likely help too."
73        );
74    } else {
75        println!();
76        println!(
77            "VERDICT: DUAL-PIPELINE — quote stripping HURTS CM. Use typed encoding + zstd (Fast mode only)."
78        );
79    }
80}