validation_ab_test/
validation_ab_test.rs1use datacortex_core::codec::compress_to_vec;
14use datacortex_core::dcx::{FormatHint, Mode};
15use std::fs;
16
17fn main() {
18 let data =
20 fs::read("corpus/test-ndjson.ndjson").expect("failed to read corpus/test-ndjson.ndjson");
21 println!("Original NDJSON: {} bytes", data.len());
22
23 let result = datacortex_core::format::ndjson::preprocess(&data)
25 .expect("ndjson preprocess failed — is the file valid uniform NDJSON?");
26 let columnar = result.data;
27 println!("Columnar data: {} bytes", columnar.len());
28
29 eprintln!("[A] Compressing columnar data (this takes a while)...");
32 let compressed_a = compress_to_vec(&columnar, Mode::Balanced, Some(FormatHint::Generic))
33 .expect("compress variant A");
34 let bpb_a = compressed_a.len() as f64 * 8.0 / data.len() as f64;
35
36 let stripped = strip_quotes(&columnar);
38 let quotes_removed = columnar.len() - stripped.len();
39 println!(
40 "Quote-stripped: {} bytes ({:.1}% of columnar, {} quote bytes removed)",
41 stripped.len(),
42 stripped.len() as f64 / columnar.len() as f64 * 100.0,
43 quotes_removed,
44 );
45
46 eprintln!("[B] Compressing quote-stripped data (this takes a while)...");
47 let compressed_b = compress_to_vec(&stripped, Mode::Balanced, Some(FormatHint::Generic))
48 .expect("compress variant B");
49 let bpb_b = compressed_b.len() as f64 * 8.0 / data.len() as f64;
50
51 println!();
53 println!("=== RESULTS ===");
54 println!(
55 "Variant A (columnar, no strip): {} bytes, {:.3} bpb",
56 compressed_a.len(),
57 bpb_a
58 );
59 println!(
60 "Variant B (quote-stripped): {} bytes, {:.3} bpb",
61 compressed_b.len(),
62 bpb_b
63 );
64
65 let improvement =
66 (compressed_a.len() as f64 - compressed_b.len() as f64) / compressed_a.len() as f64 * 100.0;
67 println!("Improvement: {:.1}%", improvement);
68
69 if compressed_b.len() < compressed_a.len() {
70 println!();
71 println!(
72 "VERDICT: PROCEED — quote stripping HELPS CM. Typed encoding will likely help too."
73 );
74 } else {
75 println!();
76 println!(
77 "VERDICT: DUAL-PIPELINE — quote stripping HURTS CM. Use typed encoding + zstd (Fast mode only)."
78 );
79 }
80}
81
/// Removes the surrounding double quotes from every quoted value in `data`.
///
/// `data` is treated as a sequence of values delimited by the bytes `0x00`
/// and `0x01`. A value is unquoted only when it is at least two bytes long
/// and both starts and ends with `"`; any other value (for example `42`,
/// `true`, or a lone `"`) is copied through unchanged. Every separator byte
/// is preserved as-is and in its original position.
///
/// Returns a new buffer; the input is not modified. Empty input yields an
/// empty buffer.
fn strip_quotes(data: &[u8]) -> Vec<u8> {
    // The output can only shrink, so the input length is a safe upper bound.
    let mut out = Vec::with_capacity(data.len());

    // `split_inclusive` keeps each separator attached to the value it
    // terminates, so the exact separator byte (0x00 vs 0x01) is not lost.
    for chunk in data.split_inclusive(|&byte| matches!(byte, 0x00 | 0x01)) {
        // Only the final chunk can lack a trailing separator.
        let (value, separator) = match chunk {
            [value @ .., sep @ (0x00 | 0x01)] => (value, Some(*sep)),
            _ => (chunk, None),
        };

        // The pattern needs two distinct quote bytes, so a lone `"` is kept.
        let unquoted = match value {
            [b'"', inner @ .., b'"'] => inner,
            _ => value,
        };

        out.extend_from_slice(unquoted);
        out.extend(separator);
    }

    out
}
118
#[cfg(test)]
mod tests {
    use super::*;

    /// Every value is quoted: all quotes go, separators stay.
    #[test]
    fn strip_quotes_basic() {
        let input = b"\"page_view\"\x01\"api_call\"\x01\"page_view\"";
        let expected = b"page_view\x01api_call\x01page_view";
        assert_eq!(strip_quotes(input), expected.to_vec());
    }

    /// Quoted and unquoted values mixed, with both separator bytes present.
    #[test]
    fn strip_quotes_mixed() {
        let input = b"\"hello\"\x0142\x01true\x01null\x00\"world\"\x01false";
        let expected = b"hello\x0142\x01true\x01null\x00world\x01false";
        assert_eq!(strip_quotes(input), expected.to_vec());
    }

    /// An empty quoted string (`""`) collapses to an empty value.
    #[test]
    fn strip_quotes_empty_string() {
        let input = b"\"\"\x01\"x\"";
        let expected = b"\x01x";
        assert_eq!(strip_quotes(input), expected.to_vec());
    }

    /// Input without any quoted values is returned unchanged.
    #[test]
    fn strip_quotes_no_strings() {
        let input = b"42\x01true\x00null\x01false";
        assert_eq!(strip_quotes(input), input.to_vec());
    }

    /// Empty input produces empty output.
    #[test]
    fn strip_quotes_empty_input() {
        assert_eq!(strip_quotes(b""), Vec::<u8>::new());
    }

    /// A one-byte value consisting of a single quote is not a quoted string
    /// and must be preserved.
    #[test]
    fn strip_quotes_lone_quote() {
        assert_eq!(strip_quotes(b"\""), b"\"".to_vec());
        assert_eq!(strip_quotes(b"\"\x01\""), b"\"\x01\"".to_vec());
    }

    /// Values with a quote on only one side are left untouched.
    #[test]
    fn strip_quotes_unbalanced() {
        let input = b"\"abc\x01def\"";
        assert_eq!(strip_quotes(input), input.to_vec());
    }

    /// Trailing and consecutive separators are preserved byte-for-byte.
    #[test]
    fn strip_quotes_separator_edges() {
        assert_eq!(strip_quotes(b"\"a\"\x01"), b"a\x01".to_vec());
        assert_eq!(strip_quotes(b"\x00\x01"), b"\x00\x01".to_vec());
    }
}