Skip to main content

datacortex_core/format/
json.rs

1//! JSON key interning — replace repeated keys with short references.
2//!
3//! Forward: scan JSON, find key strings, replace with \x00 + index.
4//! Reverse: scan for \x00 markers, expand from key dictionary.
5//!
6//! Escape scheme (null byte cannot appear in valid JSON):
7//!   \x00 + idx (0..=252)     → key dictionary reference
8//!   \x00 + 0xFD + u16 LE    → extended key reference (253..65535)
9//!   \x00 + 0xFE              → literal null byte (escape)
10
11use std::collections::HashMap;
12
13use super::transform::TransformResult;
14
/// Marker byte that introduces every escape sequence in the transformed stream.
const ESCAPE: u8 = 0x00;
/// Tag following [`ESCAPE`]: the sequence stands for one literal `0x00` byte.
const ESCAPE_LITERAL: u8 = 0xFE;
/// Tag following [`ESCAPE`]: a little-endian `u16` dictionary index comes next.
const ESCAPE_EXTENDED: u8 = 0xFD;
/// Largest dictionary index written directly as the tag byte (everything below
/// the extended tag).
const MAX_SHORT_INDEX: u8 = ESCAPE_EXTENDED - 1;
19
/// One object key located while scanning the input.
struct KeyOccurrence {
    /// Offset of the key's opening quote.
    start: usize,
    /// Offset one past the key's closing quote.
    end: usize,
    /// The key's bytes, without the surrounding quotes.
    content: Vec<u8>,
}
26
27/// Scan JSON bytes and find all key string positions.
28/// A key is a quoted string followed by ':' (after optional whitespace).
29fn find_keys(data: &[u8]) -> Vec<KeyOccurrence> {
30    let mut keys = Vec::new();
31    let mut pos = 0;
32
33    while pos < data.len() {
34        if data[pos] == b'"' {
35            let start = pos;
36            pos += 1;
37            let mut content = Vec::new();
38            let mut escaped = false;
39
40            while pos < data.len() {
41                if escaped {
42                    content.push(data[pos]);
43                    escaped = false;
44                } else if data[pos] == b'\\' {
45                    content.push(data[pos]);
46                    escaped = true;
47                } else if data[pos] == b'"' {
48                    pos += 1;
49                    break;
50                } else {
51                    content.push(data[pos]);
52                }
53                pos += 1;
54            }
55
56            let end = pos;
57
58            // Check if followed by ':' (after optional whitespace) → key.
59            let mut check = pos;
60            while check < data.len() && data[check].is_ascii_whitespace() {
61                check += 1;
62            }
63            if check < data.len() && data[check] == b':' {
64                keys.push(KeyOccurrence {
65                    start,
66                    end,
67                    content,
68                });
69            }
70        } else {
71            pos += 1;
72        }
73    }
74
75    keys
76}
77
78/// Build frequency-sorted key dictionary. Only includes keys appearing > 1 time.
79fn build_dictionary(keys: &[KeyOccurrence]) -> Vec<Vec<u8>> {
80    let mut freq: HashMap<Vec<u8>, usize> = HashMap::new();
81    for k in keys {
82        *freq.entry(k.content.clone()).or_default() += 1;
83    }
84
85    let mut entries: Vec<(Vec<u8>, usize)> =
86        freq.into_iter().filter(|(_, count)| *count > 1).collect();
87
88    // Most frequent first = smallest index.
89    entries.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
90
91    entries.into_iter().map(|(k, _)| k).collect()
92}
93
94/// Forward transform: intern repeated JSON keys.
95pub fn preprocess(data: &[u8]) -> Option<TransformResult> {
96    let keys = find_keys(data);
97    if keys.is_empty() {
98        return None;
99    }
100
101    let dict = build_dictionary(&keys);
102    if dict.is_empty() {
103        return None;
104    }
105
106    let lookup: HashMap<&[u8], usize> = dict
107        .iter()
108        .enumerate()
109        .map(|(i, k)| (k.as_slice(), i))
110        .collect();
111
112    let mut output = Vec::with_capacity(data.len());
113    let mut last_end = 0;
114
115    for key in &keys {
116        if let Some(&idx) = lookup.get(key.content.as_slice()) {
117            // Copy bytes before this key, escaping null bytes.
118            escape_copy(&data[last_end..key.start], &mut output);
119
120            // Write key reference.
121            output.push(ESCAPE);
122            if idx <= MAX_SHORT_INDEX as usize {
123                output.push(idx as u8);
124            } else {
125                output.push(ESCAPE_EXTENDED);
126                output.extend_from_slice(&(idx as u16).to_le_bytes());
127            }
128
129            last_end = key.end;
130        } else {
131            // Key not in dictionary — copy verbatim with null escaping.
132            escape_copy(&data[last_end..key.end], &mut output);
133            last_end = key.end;
134        }
135    }
136
137    // Copy remaining bytes.
138    escape_copy(&data[last_end..], &mut output);
139
140    // Only apply if preprocessed data is smaller (metadata stored separately in header).
141    let metadata = serialize_dict(&dict);
142    if output.len() >= data.len() {
143        return None;
144    }
145
146    Some(TransformResult {
147        data: output,
148        metadata,
149    })
150}
151
152/// Reverse transform: expand key references back to original strings.
153pub fn reverse(data: &[u8], metadata: &[u8]) -> Vec<u8> {
154    let dict = deserialize_dict(metadata);
155    let mut output = Vec::with_capacity(data.len() * 2);
156    let mut pos = 0;
157
158    while pos < data.len() {
159        if data[pos] == ESCAPE {
160            pos += 1;
161            if pos >= data.len() {
162                break;
163            }
164            match data[pos] {
165                ESCAPE_LITERAL => {
166                    output.push(ESCAPE);
167                    pos += 1;
168                }
169                ESCAPE_EXTENDED => {
170                    pos += 1;
171                    if pos + 2 <= data.len() {
172                        let idx = u16::from_le_bytes([data[pos], data[pos + 1]]) as usize;
173                        pos += 2;
174                        if idx < dict.len() {
175                            output.push(b'"');
176                            output.extend_from_slice(&dict[idx]);
177                            output.push(b'"');
178                        }
179                    }
180                }
181                idx if idx <= MAX_SHORT_INDEX => {
182                    let idx = idx as usize;
183                    if idx < dict.len() {
184                        output.push(b'"');
185                        output.extend_from_slice(&dict[idx]);
186                        output.push(b'"');
187                    }
188                    pos += 1;
189                }
190                _ => {
191                    // Unknown escape — pass through.
192                    pos += 1;
193                }
194            }
195        } else {
196            output.push(data[pos]);
197            pos += 1;
198        }
199    }
200
201    output
202}
203
204/// Copy bytes, escaping null bytes as \x00\xFE.
205fn escape_copy(src: &[u8], dst: &mut Vec<u8>) {
206    for &b in src {
207        if b == ESCAPE {
208            dst.push(ESCAPE);
209            dst.push(ESCAPE_LITERAL);
210        } else {
211            dst.push(b);
212        }
213    }
214}
215
/// Serialize the key dictionary into the metadata byte layout.
///
/// Layout: one version byte (`1`), the entry count as a little-endian `u16`, then
/// for each entry its length as a little-endian `u16` followed by its bytes.
///
/// The count and each length are narrowed with `as u16`, so values above
/// `u16::MAX` keep only their low 16 bits.
fn serialize_dict(dict: &[Vec<u8>]) -> Vec<u8> {
    const VERSION: u8 = 1;

    // Header (version + count) plus a two-byte length prefix per entry.
    let body_len: usize = dict.iter().map(|entry| 2 + entry.len()).sum();
    let mut bytes = Vec::with_capacity(3 + body_len);

    bytes.push(VERSION);
    bytes.extend_from_slice(&(dict.len() as u16).to_le_bytes());
    for entry in dict {
        let len_prefix = (entry.len() as u16).to_le_bytes();
        bytes.extend_from_slice(&len_prefix);
        bytes.extend_from_slice(entry);
    }
    bytes
}
226
/// Parse a dictionary from the metadata byte layout.
///
/// Expects one version byte (its value is not inspected), the entry count as a
/// little-endian `u16`, then for each entry a little-endian `u16` length followed
/// by that many bytes.
///
/// Input shorter than the three-byte header yields an empty dictionary. If the
/// data ends before all announced entries are read, the entries parsed so far are
/// returned.
fn deserialize_dict(data: &[u8]) -> Vec<Vec<u8>> {
    // Header: version (reserved for format changes) + entry count.
    let (count, mut rest) = match data {
        [_version, lo, hi, rest @ ..] => (usize::from(u16::from_le_bytes([*lo, *hi])), rest),
        _ => return Vec::new(),
    };

    let mut dict = Vec::with_capacity(count);
    for _ in 0..count {
        if rest.len() < 2 {
            break;
        }
        let key_len = usize::from(u16::from_le_bytes([rest[0], rest[1]]));
        let body = &rest[2..];
        if body.len() < key_len {
            break;
        }
        let (key, tail) = body.split_at(key_len);
        dict.push(key.to_vec());
        rest = tail;
    }
    dict
}
251
/// Unit tests for key scanning, dictionary (de)serialization and the
/// forward/reverse round trip, including the literal-null and extended-index
/// escape forms.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn find_keys_simple() {
        let data = br#"{"name": "Alice", "age": 30}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 2);
        assert_eq!(keys[0].content, b"name");
        assert_eq!(keys[1].content, b"age");
    }

    #[test]
    fn find_keys_nested() {
        let data = br#"{"a": {"b": 1, "c": 2}, "a": {"b": 3}}"#;
        let keys = find_keys(data);
        // Keys: "a", "b", "c", "a", "b"
        assert_eq!(keys.len(), 5);
    }

    #[test]
    fn find_keys_escaped_quotes() {
        let data = br#"{"key\"name": "val"}"#;
        let keys = find_keys(data);
        assert_eq!(keys.len(), 1);
        assert_eq!(keys[0].content, br#"key\"name"#.to_vec());
    }

    #[test]
    fn roundtrip_simple() {
        let data = br#"{"name": "Alice", "age": 30, "name": "Bob", "age": 25}"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    #[test]
    fn roundtrip_nested() {
        let data = br#"{"id": 1, "data": {"id": 2, "type": "x"}, "id": 3, "type": "y"}"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    #[test]
    fn roundtrip_ndjson_lines() {
        let data = br#"{"ts":"a","val":1}
{"ts":"b","val":2}
{"ts":"c","val":3}
"#;
        let result = preprocess(data).expect("should produce transform");
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    /// A raw null byte in the input must survive via the literal escape.
    #[test]
    fn roundtrip_literal_null_bytes() {
        let data = b"{\"key\":\"a\x00b\",\"key\":\"c\"}";
        let result = preprocess(data).expect("should produce transform");
        assert!(result
            .data
            .windows(2)
            .any(|w| w == [ESCAPE, ESCAPE_LITERAL]));
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data.to_vec());
    }

    /// With more repeated keys than fit in a single index byte, the extended
    /// (u16) reference form must be produced and decoded correctly.
    #[test]
    fn roundtrip_extended_index() {
        let mut data = Vec::new();
        for _ in 0..2 {
            for i in 0..300 {
                data.extend_from_slice(format!("{{\"key{:04}\":{}}}\n", i, i).as_bytes());
            }
        }
        let result = preprocess(&data).expect("should produce transform");
        assert!(result
            .data
            .windows(2)
            .any(|w| w == [ESCAPE, ESCAPE_EXTENDED]));
        let restored = reverse(&result.data, &result.metadata);
        assert_eq!(restored, data);
    }

    #[test]
    fn no_transform_unique_keys() {
        let data = br#"{"a": 1, "b": 2, "c": 3}"#;
        assert!(
            preprocess(data).is_none(),
            "unique keys should not be interned"
        );
    }

    #[test]
    fn dict_roundtrip() {
        let dict = vec![b"name".to_vec(), b"age".to_vec(), b"city".to_vec()];
        let serialized = serialize_dict(&dict);
        let deserialized = deserialize_dict(&serialized);
        assert_eq!(deserialized, dict);
    }

    /// Truncated metadata yields the entries that could be read, without panicking.
    #[test]
    fn dict_truncated_metadata() {
        assert!(deserialize_dict(&[]).is_empty());
        assert!(deserialize_dict(&[1, 2]).is_empty());
        let truncated = [1u8, 2, 0, 1, 0, b'a', 5, 0, b'b'];
        assert_eq!(deserialize_dict(&truncated), vec![b"a".to_vec()]);
    }

    #[test]
    fn size_reduction() {
        let data = br#"{"name":"Alice","age":30,"name":"Bob","age":25,"name":"Carol","age":35}"#;
        let result = preprocess(data).expect("should produce transform");
        // Interned data should be smaller than original.
        assert!(
            result.data.len() + result.metadata.len() < data.len(),
            "interned={} + meta={} should be < original={}",
            result.data.len(),
            result.metadata.len(),
            data.len()
        );
    }
}