pub fn preprocess(data: &[u8]) -> Option<TransformResult>
Forward transform: NDJSON columnar reorganization.
Tries Strategy 1 (uniform) first, then Strategy 2 (grouped) if the schemas differ. Returns `None` if the data is not suitable for a columnar transform.
Examples found in repository?
examples/validation_ab_test.rs (line 24)
17fn main() {
18 // Read test file.
19 let data =
20 fs::read("corpus/test-ndjson.ndjson").expect("failed to read corpus/test-ndjson.ndjson");
21 println!("Original NDJSON: {} bytes", data.len());
22
23 // Apply NDJSON columnar transform (low-level, no value dict).
24 let result = datacortex_core::format::ndjson::preprocess(&data)
25 .expect("ndjson preprocess failed — is the file valid uniform NDJSON?");
26 let columnar = result.data;
27 println!("Columnar data: {} bytes", columnar.len());
28
29 // --- Variant A: compress original columnar data with CM ---
30 // Use Generic format hint so compress_to_vec does NOT re-apply transforms.
31 eprintln!("[A] Compressing columnar data (this takes a while)...");
32 let compressed_a = compress_to_vec(&columnar, Mode::Balanced, Some(FormatHint::Generic))
33 .expect("compress variant A");
34 let bpb_a = compressed_a.len() as f64 * 8.0 / data.len() as f64;
35
36 // --- Variant B: strip quotes then compress ---
37 let stripped = strip_quotes(&columnar);
38 let quotes_removed = columnar.len() - stripped.len();
39 println!(
40 "Quote-stripped: {} bytes ({:.1}% of columnar, {} quote bytes removed)",
41 stripped.len(),
42 stripped.len() as f64 / columnar.len() as f64 * 100.0,
43 quotes_removed,
44 );
45
46 eprintln!("[B] Compressing quote-stripped data (this takes a while)...");
47 let compressed_b = compress_to_vec(&stripped, Mode::Balanced, Some(FormatHint::Generic))
48 .expect("compress variant B");
49 let bpb_b = compressed_b.len() as f64 * 8.0 / data.len() as f64;
50
51 // --- Results ---
52 println!();
53 println!("=== RESULTS ===");
54 println!(
55 "Variant A (columnar, no strip): {} bytes, {:.3} bpb",
56 compressed_a.len(),
57 bpb_a
58 );
59 println!(
60 "Variant B (quote-stripped): {} bytes, {:.3} bpb",
61 compressed_b.len(),
62 bpb_b
63 );
64
65 let improvement =
66 (compressed_a.len() as f64 - compressed_b.len() as f64) / compressed_a.len() as f64 * 100.0;
67 println!("Improvement: {:.1}%", improvement);
68
69 if compressed_b.len() < compressed_a.len() {
70 println!();
71 println!(
72 "VERDICT: PROCEED — quote stripping HELPS CM. Typed encoding will likely help too."
73 );
74 } else {
75 println!();
76 println!(
77 "VERDICT: DUAL-PIPELINE — quote stripping HURTS CM. Use typed encoding + zstd (Fast mode only)."
78 );
79 }
80}