Skip to main content

datacortex_core/format/
json.rs

//! JSON key interning — replace repeated keys with short references.
//!
//! Forward: scan the JSON, find key strings, and replace them with `\x00` + index.
//! Reverse: scan for `\x00` markers and expand them from the key dictionary.
//!
//! Escape scheme (a null byte cannot appear in valid JSON):
//!   \x00 + idx (0..=252)     → key dictionary reference
//!   \x00 + 0xFD + u16 LE     → extended key reference (253..=65535)
//!   \x00 + 0xFE              → literal null byte (escape)
10
11use super::transform::TransformResult;
12use std::collections::HashMap;
13
/// Marker byte that opens every escape sequence in the transformed stream.
const ESCAPE: u8 = 0x00;
/// Tag byte announcing that a little-endian `u16` dictionary index follows.
const ESCAPE_EXTENDED: u8 = 0xFD;
/// Tag byte standing for an escaped literal null byte.
const ESCAPE_LITERAL: u8 = 0xFE;
/// Largest dictionary index that fits the single-byte reference form
/// (everything below the extended tag).
const MAX_SHORT_INDEX: u8 = ESCAPE_EXTENDED - 1;
18
/// A key occurrence found during scanning.
///
/// `start..end` is the byte range of the quoted key (quotes included) in the
/// scanned input; `content` holds the bytes between the quotes.
#[derive(Debug, Clone, PartialEq, Eq)]
struct KeyOccurrence {
    /// Position of the opening quote.
    start: usize,
    /// Position just past the closing quote.
    end: usize,
    /// Raw bytes between the quotes, with backslash escapes left undecoded.
    content: Vec<u8>,
}
25
26/// Scan JSON bytes and find all key string positions.
27/// A key is a quoted string followed by ':' (after optional whitespace).
28fn find_keys(data: &[u8]) -> Vec<KeyOccurrence> {
29    let mut keys = Vec::new();
30    let mut pos = 0;
31
32    while pos < data.len() {
33        if data[pos] == b'"' {
34            let start = pos;
35            pos += 1;
36            let mut content = Vec::new();
37            let mut escaped = false;
38
39            while pos < data.len() {
40                if escaped {
41                    content.push(data[pos]);
42                    escaped = false;
43                } else if data[pos] == b'\\' {
44                    content.push(data[pos]);
45                    escaped = true;
46                } else if data[pos] == b'"' {
47                    pos += 1;
48                    break;
49                } else {
50                    content.push(data[pos]);
51                }
52                pos += 1;
53            }
54
55            let end = pos;
56
57            // Check if followed by ':' (after optional whitespace) → key.
58            let mut check = pos;
59            while check < data.len() && data[check].is_ascii_whitespace() {
60                check += 1;
61            }
62            if check < data.len() && data[check] == b':' {
63                keys.push(KeyOccurrence {
64                    start,
65                    end,
66                    content,
67                });
68            }
69        } else {
70            pos += 1;
71        }
72    }
73
74    keys
75}
76
77/// Build frequency-sorted key dictionary. Only includes keys appearing > 1 time.
78fn build_dictionary(keys: &[KeyOccurrence]) -> Vec<Vec<u8>> {
79    let mut freq: HashMap<Vec<u8>, usize> = HashMap::new();
80    for k in keys {
81        *freq.entry(k.content.clone()).or_default() += 1;
82    }
83
84    let mut entries: Vec<(Vec<u8>, usize)> =
85        freq.into_iter().filter(|(_, count)| *count > 1).collect();
86
87    // Most frequent first = smallest index.
88    entries.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
89
90    entries.into_iter().map(|(k, _)| k).collect()
91}
92
93/// Forward transform: intern repeated JSON keys.
94pub fn preprocess(data: &[u8]) -> Option<TransformResult> {
95    let keys = find_keys(data);
96    if keys.is_empty() {
97        return None;
98    }
99
100    let dict = build_dictionary(&keys);
101    if dict.is_empty() {
102        return None;
103    }
104
105    let lookup: HashMap<&[u8], usize> = dict
106        .iter()
107        .enumerate()
108        .map(|(i, k)| (k.as_slice(), i))
109        .collect();
110
111    let mut output = Vec::with_capacity(data.len());
112    let mut last_end = 0;
113
114    for key in &keys {
115        if let Some(&idx) = lookup.get(key.content.as_slice()) {
116            // Copy bytes before this key, escaping null bytes.
117            escape_copy(&data[last_end..key.start], &mut output);
118
119            // Write key reference.
120            output.push(ESCAPE);
121            if idx <= MAX_SHORT_INDEX as usize {
122                output.push(idx as u8);
123            } else {
124                output.push(ESCAPE_EXTENDED);
125                output.extend_from_slice(&(idx as u16).to_le_bytes());
126            }
127
128            last_end = key.end;
129        } else {
130            // Key not in dictionary — copy verbatim with null escaping.
131            escape_copy(&data[last_end..key.end], &mut output);
132            last_end = key.end;
133        }
134    }
135
136    // Copy remaining bytes.
137    escape_copy(&data[last_end..], &mut output);
138
139    // Only apply if preprocessed data is smaller (metadata stored separately in header).
140    let metadata = serialize_dict(&dict);
141    if output.len() >= data.len() {
142        return None;
143    }
144
145    Some(TransformResult {
146        data: output,
147        metadata,
148    })
149}
150
151/// Reverse transform: expand key references back to original strings.
152pub fn reverse(data: &[u8], metadata: &[u8]) -> Vec<u8> {
153    let dict = deserialize_dict(metadata);
154    let mut output = Vec::with_capacity(data.len() * 2);
155    let mut pos = 0;
156
157    while pos < data.len() {
158        if data[pos] == ESCAPE {
159            pos += 1;
160            if pos >= data.len() {
161                break;
162            }
163            match data[pos] {
164                ESCAPE_LITERAL => {
165                    output.push(ESCAPE);
166                    pos += 1;
167                }
168                ESCAPE_EXTENDED => {
169                    pos += 1;
170                    if pos + 2 <= data.len() {
171                        let idx = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
172                        pos += 2;
173                        if idx < dict.len() {
174                            output.push(b'"');
175                            output.extend_from_slice(&dict[idx]);
176                            output.push(b'"');
177                        }
178                    }
179                }
180                idx if idx <= MAX_SHORT_INDEX => {
181                    let idx = idx as usize;
182                    if idx < dict.len() {
183                        output.push(b'"');
184                        output.extend_from_slice(&dict[idx]);
185                        output.push(b'"');
186                    }
187                    pos += 1;
188                }
189                _ => {
190                    // Unknown escape — pass through.
191                    pos += 1;
192                }
193            }
194        } else {
195            output.push(data[pos]);
196            pos += 1;
197        }
198    }
199
200    output
201}
202
203/// Copy bytes, escaping null bytes as \x00\xFE.
204fn escape_copy(src: &[u8], dst: &mut Vec<u8>) {
205    for &b in src {
206        if b == ESCAPE {
207            dst.push(ESCAPE);
208            dst.push(ESCAPE_LITERAL);
209        } else {
210            dst.push(b);
211        }
212    }
213}
214
/// Serialize the key dictionary into its metadata form.
///
/// Layout: one version byte (`1`), the entry count as little-endian `u16`,
/// then for each key its length as little-endian `u16` followed by the raw
/// key bytes. Count and lengths are narrowed with `as u16`, so values above
/// `u16::MAX` wrap.
fn serialize_dict(dict: &[Vec<u8>]) -> Vec<u8> {
    let payload: usize = dict.iter().map(|key| 2 + key.len()).sum();
    let mut bytes = Vec::with_capacity(3 + payload);

    bytes.push(1); // version
    bytes.extend_from_slice(&(dict.len() as u16).to_le_bytes());

    for key in dict {
        let len_field = (key.len() as u16).to_le_bytes();
        bytes.extend_from_slice(&len_field);
        bytes.extend_from_slice(key);
    }

    bytes
}
225
/// Parse a serialized key dictionary.
///
/// Expects a version byte, a little-endian `u16` entry count, and then
/// length-prefixed (`u16` LE) keys. The version byte is read past but not
/// checked. Parsing is lenient: input shorter than the three-byte header
/// yields an empty dictionary, and a truncated entry ends parsing with the
/// entries decoded so far.
fn deserialize_dict(data: &[u8]) -> Vec<Vec<u8>> {
    if data.len() < 3 {
        return Vec::new();
    }

    // data[0] is the version byte.
    let count = u16::from_le_bytes([data[1], data[2]]) as usize;
    let mut entries = Vec::with_capacity(count);

    // `rest` always starts at the next unread byte.
    let mut rest = &data[3..];
    for _ in 0..count {
        if rest.len() < 2 {
            break;
        }
        let len = u16::from_le_bytes([rest[0], rest[1]]) as usize;
        rest = &rest[2..];

        if rest.len() < len {
            break;
        }
        let (key, tail) = rest.split_at(len);
        entries.push(key.to_vec());
        rest = tail;
    }

    entries
}
250
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn find_keys_simple() {
        let data = br#"{"name": "Alice", "age": 30}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 2);
        assert_eq!(keys[0].content, b"name");
        assert_eq!(keys[1].content, b"age");
    }

    #[test]
    fn find_keys_nested() {
        let data = br#"{"a": {"b": 1, "c": 2}, "a": {"b": 3}}"#;
        let keys = find_keys(data);
        // Keys: "a", "b", "c", "a", "b"
        assert_eq!(keys.len(), 5);
    }

    #[test]
    fn find_keys_escaped_quotes() {
        let data = br#"{"key\"name": "val"}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 1);
        assert_eq!(keys[0].content, br#"key\"name"#.to_vec());
    }

    #[test]
    fn roundtrip_simple() {
        let data = br#"{"name": "Alice", "age": 30, "name": "Bob", "age": 25}"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    #[test]
    fn roundtrip_nested() {
        let data = br#"{"id": 1, "data": {"id": 2, "type": "x"}, "id": 3, "type": "y"}"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    #[test]
    fn roundtrip_ndjson_lines() {
        let data = br#"{"ts":"a","val":1}
{"ts":"b","val":2}
{"ts":"c","val":3}
"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    /// Null bytes in the input must travel through the literal escape
    /// (`\x00\xFE`) and come back unchanged.
    #[test]
    fn roundtrip_null_bytes() {
        let data = b"{\"name\":\"a\x00b\",\"name\":\"\x00\"}";
        let result = preprocess(data).expect("should produce transform");
        assert!(
            result
                .data
                .windows(2)
                .any(|w| w[0] == ESCAPE && w[1] == ESCAPE_LITERAL),
            "null bytes should be written as the literal escape"
        );
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    /// More repeated keys than the short form can index, so the extended
    /// (`\x00\xFD` + u16) reference form is exercised as well.
    #[test]
    fn roundtrip_extended_index() {
        let mut data = Vec::new();
        for _ in 0..2 {
            data.push(b'{');
            for i in 0..300 {
                if i > 0 {
                    data.push(b',');
                }
                data.extend_from_slice(format!("\"key{:03}\":{}", i, i).as_bytes());
            }
            data.extend_from_slice(b"}\n");
        }

        let result = preprocess(&data).expect("should produce transform");
        // The input is pure ASCII, so 0xFD can only come from an extended reference.
        assert!(result.data.contains(&ESCAPE_EXTENDED));
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data);
    }

    #[test]
    fn no_transform_unique_keys() {
        let data = br#"{"a": 1, "b": 2, "c": 3}"#;
        assert!(
            preprocess(data).is_none(),
            "unique keys should not be interned"
        );
    }

    #[test]
    fn dict_roundtrip() {
        let dict = vec![b"name".to_vec(), b"age".to_vec(), b"city".to_vec()];
        let serialized = serialize_dict(&dict);
        let deserialized = deserialize_dict(&serialized);
        assert_eq!(deserialized, dict);
    }

    #[test]
    fn size_reduction() {
        let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
        let result = preprocess(data).expect("should produce transform");
        // Interned data should be smaller than original.
        assert!(
            result.data.len() + result.metadata.len() < data.len(),
            "interned={} + meta={} should be < original={}",
            result.data.len(),
            result.metadata.len(),
            data.len()
        );
    }
}