Skip to main content

validation_ab_test/
validation_ab_test.rs

1//! Validation A/B test: does quote stripping help CM compression?
2//!
3//! Reads corpus/test-ndjson.ndjson, applies NDJSON columnar transform,
4//! then compresses two variants with the CM engine (Balanced mode):
5//!   A = columnar data as-is (current behavior)
6//!   B = columnar data with JSON string quotes stripped
7//!
8//! If B is smaller, typed encoding will likely help CM => PROCEED.
9//! If B is larger, typed encoding should only be used with zstd => DUAL-PIPELINE.
10//!
11//! Usage: cargo run --release --example validation_ab_test
12
13use datacortex_core::codec::compress_to_vec;
14use datacortex_core::dcx::{FormatHint, Mode};
15use std::fs;
16
17fn main() {
18    // Read test file.
19    let data =
20        fs::read("corpus/test-ndjson.ndjson").expect("failed to read corpus/test-ndjson.ndjson");
21    println!("Original NDJSON: {} bytes", data.len());
22
23    // Apply NDJSON columnar transform (low-level, no value dict).
24    let result = datacortex_core::format::ndjson::preprocess(&data)
25        .expect("ndjson preprocess failed — is the file valid uniform NDJSON?");
26    let columnar = result.data;
27    println!("Columnar data:   {} bytes", columnar.len());
28
29    // --- Variant A: compress original columnar data with CM ---
30    // Use Generic format hint so compress_to_vec does NOT re-apply transforms.
31    eprintln!("[A] Compressing columnar data (this takes a while)...");
32    let compressed_a = compress_to_vec(&columnar, Mode::Balanced, Some(FormatHint::Generic))
33        .expect("compress variant A");
34    let bpb_a = compressed_a.len() as f64 * 8.0 / data.len() as f64;
35
36    // --- Variant B: strip quotes then compress ---
37    let stripped = strip_quotes(&columnar);
38    let quotes_removed = columnar.len() - stripped.len();
39    println!(
40        "Quote-stripped:   {} bytes ({:.1}% of columnar, {} quote bytes removed)",
41        stripped.len(),
42        stripped.len() as f64 / columnar.len() as f64 * 100.0,
43        quotes_removed,
44    );
45
46    eprintln!("[B] Compressing quote-stripped data (this takes a while)...");
47    let compressed_b = compress_to_vec(&stripped, Mode::Balanced, Some(FormatHint::Generic))
48        .expect("compress variant B");
49    let bpb_b = compressed_b.len() as f64 * 8.0 / data.len() as f64;
50
51    // --- Results ---
52    println!();
53    println!("=== RESULTS ===");
54    println!(
55        "Variant A (columnar, no strip): {} bytes, {:.3} bpb",
56        compressed_a.len(),
57        bpb_a
58    );
59    println!(
60        "Variant B (quote-stripped):      {} bytes, {:.3} bpb",
61        compressed_b.len(),
62        bpb_b
63    );
64
65    let improvement =
66        (compressed_a.len() as f64 - compressed_b.len() as f64) / compressed_a.len() as f64 * 100.0;
67    println!("Improvement: {:.1}%", improvement);
68
69    if compressed_b.len() < compressed_a.len() {
70        println!();
71        println!(
72            "VERDICT: PROCEED — quote stripping HELPS CM. Typed encoding will likely help too."
73        );
74    } else {
75        println!();
76        println!(
77            "VERDICT: DUAL-PIPELINE — quote stripping HURTS CM. Use typed encoding + zstd (Fast mode only)."
78        );
79    }
80}
81
/// Strip the surrounding `"` pair from every JSON-string value in columnar data.
///
/// Columnar layout uses `\x00` as column separator and `\x01` as value
/// separator; the input is treated as a sequence of values delimited by either
/// byte. For each value:
///
/// * if it is at least two bytes long and both starts and ends with `"`, it is
///   taken to be a JSON string and emitted without those two bytes (so `""`
///   becomes an empty value);
/// * otherwise (number, boolean, null, empty value, a lone `"`, or a value
///   quoted on one side only) it is copied unchanged.
///
/// Separator bytes are always copied through in their original positions, and
/// only the outermost quote pair is removed — string contents, including any
/// escaped quotes, are left untouched.
///
/// The output does not record which values were quoted, so it is meant for
/// size comparison only and cannot be mapped back to the input.
fn strip_quotes(data: &[u8]) -> Vec<u8> {
    const COLUMN_SEP: u8 = 0x00;
    const VALUE_SEP: u8 = 0x01;
    const QUOTE: &[u8] = b"\"";

    let is_sep = |byte: &u8| matches!(*byte, COLUMN_SEP | VALUE_SEP);

    // Stripping only ever removes bytes, so the input length is an upper bound.
    let mut out = Vec::with_capacity(data.len());

    // Each chunk is one value followed by its separator; only the final chunk
    // may lack a separator (when the data does not end on one).
    for chunk in data.split_inclusive(is_sep) {
        let (value, sep) = match chunk.split_last() {
            Some((last, head)) if is_sep(last) => (head, Some(*last)),
            _ => (chunk, None),
        };

        // Both strips must succeed on distinct bytes, which rules out a lone `"`.
        let unquoted = value
            .strip_prefix(QUOTE)
            .and_then(|rest| rest.strip_suffix(QUOTE));

        out.extend_from_slice(unquoted.unwrap_or(value));
        out.extend(sep);
    }

    out
}
118
#[cfg(test)]
mod tests {
    use super::strip_quotes;

    /// Asserts that `strip_quotes(input)` yields exactly the `expected` bytes.
    #[track_caller]
    fn check(input: &[u8], expected: &[u8]) {
        assert_eq!(strip_quotes(input), expected.to_vec());
    }

    /// Every value is a quoted string: all surrounding quotes disappear.
    #[test]
    fn strip_quotes_basic() {
        // "page_view"\x01"api_call"\x01"page_view"
        check(
            b"\"page_view\"\x01\"api_call\"\x01\"page_view\"",
            b"page_view\x01api_call\x01page_view",
        );
    }

    /// Strings lose their quotes; numbers, booleans and null are untouched,
    /// across both separator kinds.
    #[test]
    fn strip_quotes_mixed() {
        // "hello"\x0142\x01true\x01null\x00"world"\x01false
        check(
            b"\"hello\"\x0142\x01true\x01null\x00\"world\"\x01false",
            b"hello\x0142\x01true\x01null\x00world\x01false",
        );
    }

    /// An empty JSON string (`""`) collapses to an empty value.
    #[test]
    fn strip_quotes_empty_string() {
        // ""\x01"x"
        check(b"\"\"\x01\"x\"", b"\x01x");
    }

    /// Input without any quoted value comes back byte-for-byte.
    #[test]
    fn strip_quotes_no_strings() {
        let input = b"42\x01true\x00null\x01false";
        check(input, input);
    }
}
152}